[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <[email protected]>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 static size_t rt6_nlmsg_size(struct rt6_info *rt);
102 static int rt6_fill_node(struct net *net,
103                          struct sk_buff *skb, struct rt6_info *rt,
104                          struct in6_addr *dst, struct in6_addr *src,
105                          int iif, int type, u32 portid, u32 seq,
106                          unsigned int flags);
107
108 #ifdef CONFIG_IPV6_ROUTE_INFO
109 static struct rt6_info *rt6_add_route_info(struct net *net,
110                                            const struct in6_addr *prefix, int prefixlen,
111                                            const struct in6_addr *gwaddr,
112                                            struct net_device *dev,
113                                            unsigned int pref);
114 static struct rt6_info *rt6_get_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev);
118 #endif
119
120 struct uncached_list {
121         spinlock_t              lock;
122         struct list_head        head;
123 };
124
125 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
126
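/* Routes that are not owned by the fib6 tree (e.g. the RTF_CACHE clones
 * created for FLOWI_FLAG_KNOWN_NH in ip6_pol_route()) are kept on a
 * per-cpu "uncached" list so that rt6_uncached_list_flush_dev() can
 * re-point them at the loopback device when their device goes away.
 */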
127 static void rt6_uncached_list_add(struct rt6_info *rt)
128 {
129         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
130
131         rt->rt6i_uncached_list = ul;
132
133         spin_lock_bh(&ul->lock);
134         list_add_tail(&rt->rt6i_uncached, &ul->head);
135         spin_unlock_bh(&ul->lock);
136 }
137
138 static void rt6_uncached_list_del(struct rt6_info *rt)
139 {
140         if (!list_empty(&rt->rt6i_uncached)) {
141                 struct uncached_list *ul = rt->rt6i_uncached_list;
142
143                 spin_lock_bh(&ul->lock);
144                 list_del(&rt->rt6i_uncached);
145                 spin_unlock_bh(&ul->lock);
146         }
147 }
148
149 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
150 {
151         struct net_device *loopback_dev = net->loopback_dev;
152         int cpu;
153
154         if (dev == loopback_dev)
155                 return;
156
157         for_each_possible_cpu(cpu) {
158                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
159                 struct rt6_info *rt;
160
161                 spin_lock_bh(&ul->lock);
162                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
163                         struct inet6_dev *rt_idev = rt->rt6i_idev;
164                         struct net_device *rt_dev = rt->dst.dev;
165
166                         if (rt_idev->dev == dev) {
167                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
168                                 in6_dev_put(rt_idev);
169                         }
170
171                         if (rt_dev == dev) {
172                                 rt->dst.dev = loopback_dev;
173                                 dev_hold(rt->dst.dev);
174                                 dev_put(rt_dev);
175                         }
176                 }
177                 spin_unlock_bh(&ul->lock);
178         }
179 }
180
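/* RTF_PCPU clones share metrics with the route they were copied from,
 * so copy-on-write goes to the parent's (dst.from) metrics.
 */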
181 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
182 {
183         return dst_metrics_write_ptr(rt->dst.from);
184 }
185
186 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
187 {
188         struct rt6_info *rt = (struct rt6_info *)dst;
189
190         if (rt->rt6i_flags & RTF_PCPU)
191                 return rt6_pcpu_cow_metrics(rt);
192         else if (rt->rt6i_flags & RTF_CACHE)
193                 return NULL;
194         else
195                 return dst_cow_metrics_generic(dst, old);
196 }
197
198 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
199                                              struct sk_buff *skb,
200                                              const void *daddr)
201 {
202         struct in6_addr *p = &rt->rt6i_gateway;
203
204         if (!ipv6_addr_any(p))
205                 return (const void *) p;
206         else if (skb)
207                 return &ipv6_hdr(skb)->daddr;
208         return daddr;
209 }
210
211 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
212                                           struct sk_buff *skb,
213                                           const void *daddr)
214 {
215         struct rt6_info *rt = (struct rt6_info *) dst;
216         struct neighbour *n;
217
218         daddr = choose_neigh_daddr(rt, skb, daddr);
219         n = __ipv6_neigh_lookup(dst->dev, daddr);
220         if (n)
221                 return n;
222         return neigh_create(&nd_tbl, daddr, dst->dev);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(rt, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       ipv6_cow_metrics,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
271 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
272                                       struct sk_buff *skb)
273 {
274 }
275
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_neigh_lookup,
286 };
287
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
292 static const struct rt6_info ip6_null_entry_template = {
293         .dst = {
294                 .__refcnt       = ATOMIC_INIT(1),
295                 .__use          = 1,
296                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
297                 .error          = -ENETUNREACH,
298                 .input          = ip6_pkt_discard,
299                 .output         = ip6_pkt_discard_out,
300         },
301         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
302         .rt6i_protocol  = RTPROT_KERNEL,
303         .rt6i_metric    = ~(u32) 0,
304         .rt6i_ref       = ATOMIC_INIT(1),
305 };
306
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
309 static const struct rt6_info ip6_prohibit_entry_template = {
310         .dst = {
311                 .__refcnt       = ATOMIC_INIT(1),
312                 .__use          = 1,
313                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
314                 .error          = -EACCES,
315                 .input          = ip6_pkt_prohibit,
316                 .output         = ip6_pkt_prohibit_out,
317         },
318         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
319         .rt6i_protocol  = RTPROT_KERNEL,
320         .rt6i_metric    = ~(u32) 0,
321         .rt6i_ref       = ATOMIC_INIT(1),
322 };
323
324 static const struct rt6_info ip6_blk_hole_entry_template = {
325         .dst = {
326                 .__refcnt       = ATOMIC_INIT(1),
327                 .__use          = 1,
328                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
329                 .error          = -EINVAL,
330                 .input          = dst_discard,
331                 .output         = dst_discard_out,
332         },
333         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
334         .rt6i_protocol  = RTPROT_KERNEL,
335         .rt6i_metric    = ~(u32) 0,
336         .rt6i_ref       = ATOMIC_INIT(1),
337 };
338
339 #endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_siblings);
347         INIT_LIST_HEAD(&rt->rt6i_uncached);
348 }
349
350 /* allocate dst with ip6_dst_ops */
351 static struct rt6_info *__ip6_dst_alloc(struct net *net,
352                                         struct net_device *dev,
353                                         int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt)
359                 rt6_info_init(rt);
360
361         return rt;
362 }
363
364 struct rt6_info *ip6_dst_alloc(struct net *net,
365                                struct net_device *dev,
366                                int flags)
367 {
368         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
369
370         if (rt) {
371                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
372                 if (rt->rt6i_pcpu) {
373                         int cpu;
374
375                         for_each_possible_cpu(cpu) {
376                                 struct rt6_info **p;
377
378                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
379                                 /* no one shares rt */
380                                 *p =  NULL;
381                         }
382                 } else {
383                         dst_release_immediate(&rt->dst);
384                         return NULL;
385                 }
386         }
387
388         return rt;
389 }
390 EXPORT_SYMBOL(ip6_dst_alloc);
391
392 static void ip6_dst_destroy(struct dst_entry *dst)
393 {
394         struct rt6_info *rt = (struct rt6_info *)dst;
395         struct dst_entry *from = dst->from;
396         struct inet6_dev *idev;
397
398         dst_destroy_metrics_generic(dst);
399         free_percpu(rt->rt6i_pcpu);
400         rt6_uncached_list_del(rt);
401
402         idev = rt->rt6i_idev;
403         if (idev) {
404                 rt->rt6i_idev = NULL;
405                 in6_dev_put(idev);
406         }
407
408         dst->from = NULL;
409         dst_release(from);
410 }
411
412 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
413                            int how)
414 {
415         struct rt6_info *rt = (struct rt6_info *)dst;
416         struct inet6_dev *idev = rt->rt6i_idev;
417         struct net_device *loopback_dev =
418                 dev_net(dev)->loopback_dev;
419
420         if (idev && idev->dev != loopback_dev) {
421                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
422                 if (loopback_idev) {
423                         rt->rt6i_idev = loopback_idev;
424                         in6_dev_put(idev);
425                 }
426         }
427 }
428
429 static bool __rt6_check_expired(const struct rt6_info *rt)
430 {
431         if (rt->rt6i_flags & RTF_EXPIRES)
432                 return time_after(jiffies, rt->dst.expires);
433         else
434                 return false;
435 }
436
437 static bool rt6_check_expired(const struct rt6_info *rt)
438 {
439         if (rt->rt6i_flags & RTF_EXPIRES) {
440                 if (time_after(jiffies, rt->dst.expires))
441                         return true;
442         } else if (rt->dst.from) {
443                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
444                        rt6_check_expired((struct rt6_info *)rt->dst.from);
445         }
446         return false;
447 }
448
449 /* Multipath route selection:
450  *   Hash-based function using the packet header and flow label.
451  * Adapted from fib_info_hashfn()
452  */
453 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
454                                const struct flowi6 *fl6)
455 {
456         return get_hash_from_flowi6(fl6) % candidate_count;
457 }
458
459 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
460                                              struct flowi6 *fl6, int oif,
461                                              int strict)
462 {
463         struct rt6_info *sibling, *next_sibling;
464         int route_choosen;
465
466         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
467         /* Don't change the route if route_choosen == 0
468          * (the sibling list does not include ourselves)
469          */
470         if (route_choosen)
471                 list_for_each_entry_safe(sibling, next_sibling,
472                                 &match->rt6i_siblings, rt6i_siblings) {
473                         route_choosen--;
474                         if (route_choosen == 0) {
475                                 if (rt6_score_route(sibling, oif, strict) < 0)
476                                         break;
477                                 match = sibling;
478                                 break;
479                         }
480                 }
481         return match;
482 }
483
484 /*
485  *      Route lookup. The relevant table->tb6_lock is assumed to be held.
486  */
487
488 static inline struct rt6_info *rt6_device_match(struct net *net,
489                                                     struct rt6_info *rt,
490                                                     const struct in6_addr *saddr,
491                                                     int oif,
492                                                     int flags)
493 {
494         struct rt6_info *local = NULL;
495         struct rt6_info *sprt;
496
497         if (!oif && ipv6_addr_any(saddr))
498                 goto out;
499
500         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
501                 struct net_device *dev = sprt->dst.dev;
502
503                 if (oif) {
504                         if (dev->ifindex == oif)
505                                 return sprt;
506                         if (dev->flags & IFF_LOOPBACK) {
507                                 if (!sprt->rt6i_idev ||
508                                     sprt->rt6i_idev->dev->ifindex != oif) {
509                                         if (flags & RT6_LOOKUP_F_IFACE)
510                                                 continue;
511                                         if (local &&
512                                             local->rt6i_idev->dev->ifindex == oif)
513                                                 continue;
514                                 }
515                                 local = sprt;
516                         }
517                 } else {
518                         if (ipv6_chk_addr(net, saddr, dev,
519                                           flags & RT6_LOOKUP_F_IFACE))
520                                 return sprt;
521                 }
522         }
523
524         if (oif) {
525                 if (local)
526                         return local;
527
528                 if (flags & RT6_LOOKUP_F_IFACE)
529                         return net->ipv6.ip6_null_entry;
530         }
531 out:
532         return rt;
533 }
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537         struct work_struct work;
538         struct in6_addr target;
539         struct net_device *dev;
540 };
541
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544         struct in6_addr mcaddr;
545         struct __rt6_probe_work *work =
546                 container_of(w, struct __rt6_probe_work, work);
547
548         addrconf_addr_solict_mult(&work->target, &mcaddr);
549         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550         dev_put(work->dev);
551         kfree(work);
552 }
553
554 static void rt6_probe(struct rt6_info *rt)
555 {
556         struct __rt6_probe_work *work;
557         struct neighbour *neigh;
558         /*
559          * Okay, this does not seem to be appropriate
560          * for now, however, we need to check if it
561          * is really so; aka Router Reachability Probing.
562          *
563          * Router Reachability Probe MUST be rate-limited
564          * to no more than one per minute.
565          */
566         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567                 return;
568         rcu_read_lock_bh();
569         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570         if (neigh) {
571                 if (neigh->nud_state & NUD_VALID)
572                         goto out;
573
574                 work = NULL;
575                 write_lock(&neigh->lock);
576                 if (!(neigh->nud_state & NUD_VALID) &&
577                     time_after(jiffies,
578                                neigh->updated +
579                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else {
586                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
587         }
588
589         if (work) {
590                 INIT_WORK(&work->work, rt6_probe_deferred);
591                 work->target = rt->rt6i_gateway;
592                 dev_hold(rt->dst.dev);
593                 work->dev = rt->dst.dev;
594                 schedule_work(&work->work);
595         }
596
597 out:
598         rcu_read_unlock_bh();
599 }
600 #else
601 static inline void rt6_probe(struct rt6_info *rt)
602 {
603 }
604 #endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611         struct net_device *dev = rt->dst.dev;
612         if (!oif || dev->ifindex == oif)
613                 return 2;
614         if ((dev->flags & IFF_LOOPBACK) &&
615             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616                 return 1;
617         return 0;
618 }
619
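/* Classify next-hop reachability for router selection: non-gateway
 * routes always succeed; otherwise the neighbour state decides between
 * success, probing (RT6_NUD_FAIL_PROBE) and round-robin fallback
 * (RT6_NUD_FAIL_DO_RR), depending on CONFIG_IPV6_ROUTER_PREF.
 */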
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622         struct neighbour *neigh;
623         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624
625         if (rt->rt6i_flags & RTF_NONEXTHOP ||
626             !(rt->rt6i_flags & RTF_GATEWAY))
627                 return RT6_NUD_SUCCEED;
628
629         rcu_read_lock_bh();
630         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631         if (neigh) {
632                 read_lock(&neigh->lock);
633                 if (neigh->nud_state & NUD_VALID)
634                         ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636                 else if (!(neigh->nud_state & NUD_FAILED))
637                         ret = RT6_NUD_SUCCEED;
638                 else
639                         ret = RT6_NUD_FAIL_PROBE;
640 #endif
641                 read_unlock(&neigh->lock);
642         } else {
643                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645         }
646         rcu_read_unlock_bh();
647
648         return ret;
649 }
650
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652                            int strict)
653 {
654         int m;
655
656         m = rt6_check_dev(rt, oif);
657         if (!m && (strict & RT6_LOOKUP_F_IFACE))
658                 return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662         if (strict & RT6_LOOKUP_F_REACHABLE) {
663                 int n = rt6_check_neigh(rt);
664                 if (n < 0)
665                         return n;
666         }
667         return m;
668 }
669
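/* Compare @rt against the best candidate found so far: score it with
 * rt6_score_route(), update *mpri and @match when it wins, and set
 * *do_rr when a round-robin among routers is requested.
 */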
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671                                    int *mpri, struct rt6_info *match,
672                                    bool *do_rr)
673 {
674         int m;
675         bool match_do_rr = false;
676         struct inet6_dev *idev = rt->rt6i_idev;
677         struct net_device *dev = rt->dst.dev;
678
679         if (dev && !netif_carrier_ok(dev) &&
680             idev->cnf.ignore_routes_with_linkdown &&
681             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
682                 goto out;
683
684         if (rt6_check_expired(rt))
685                 goto out;
686
687         m = rt6_score_route(rt, oif, strict);
688         if (m == RT6_NUD_FAIL_DO_RR) {
689                 match_do_rr = true;
690                 m = 0; /* lowest valid score */
691         } else if (m == RT6_NUD_FAIL_HARD) {
692                 goto out;
693         }
694
695         if (strict & RT6_LOOKUP_F_REACHABLE)
696                 rt6_probe(rt);
697
698         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
699         if (m > *mpri) {
700                 *do_rr = match_do_rr;
701                 *mpri = m;
702                 match = rt;
703         }
704 out:
705         return match;
706 }
707
708 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
709                                      struct rt6_info *rr_head,
710                                      u32 metric, int oif, int strict,
711                                      bool *do_rr)
712 {
713         struct rt6_info *rt, *match, *cont;
714         int mpri = -1;
715
716         match = NULL;
717         cont = NULL;
718         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
719                 if (rt->rt6i_metric != metric) {
720                         cont = rt;
721                         break;
722                 }
723
724                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
725         }
726
727         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
728                 if (rt->rt6i_metric != metric) {
729                         cont = rt;
730                         break;
731                 }
732
733                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
734         }
735
736         if (match || !cont)
737                 return match;
738
739         for (rt = cont; rt; rt = rt->dst.rt6_next)
740                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
741
742         return match;
743 }
744
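/* Default router selection for @fn: scan the entries sharing
 * fn->rr_ptr's metric via find_rr_leaf() and, when requested,
 * advance fn->rr_ptr to round-robin between equally good routers.
 */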
745 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
746 {
747         struct rt6_info *match, *rt0;
748         struct net *net;
749         bool do_rr = false;
750
751         rt0 = fn->rr_ptr;
752         if (!rt0)
753                 fn->rr_ptr = rt0 = fn->leaf;
754
755         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
756                              &do_rr);
757
758         if (do_rr) {
759                 struct rt6_info *next = rt0->dst.rt6_next;
760
761                 /* no entries matched; do round-robin */
762                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
763                         next = fn->leaf;
764
765                 if (next != rt0)
766                         fn->rr_ptr = next;
767         }
768
769         net = dev_net(rt0->dst.dev);
770         return match ? match : net->ipv6.ip6_null_entry;
771 }
772
773 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
774 {
775         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
776 }
777
778 #ifdef CONFIG_IPV6_ROUTE_INFO
779 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
780                   const struct in6_addr *gwaddr)
781 {
782         struct net *net = dev_net(dev);
783         struct route_info *rinfo = (struct route_info *) opt;
784         struct in6_addr prefix_buf, *prefix;
785         unsigned int pref;
786         unsigned long lifetime;
787         struct rt6_info *rt;
788
789         if (len < sizeof(struct route_info)) {
790                 return -EINVAL;
791         }
792
793         /* Sanity check for prefix_len and length */
794         if (rinfo->length > 3) {
795                 return -EINVAL;
796         } else if (rinfo->prefix_len > 128) {
797                 return -EINVAL;
798         } else if (rinfo->prefix_len > 64) {
799                 if (rinfo->length < 2) {
800                         return -EINVAL;
801                 }
802         } else if (rinfo->prefix_len > 0) {
803                 if (rinfo->length < 1) {
804                         return -EINVAL;
805                 }
806         }
807
808         pref = rinfo->route_pref;
809         if (pref == ICMPV6_ROUTER_PREF_INVALID)
810                 return -EINVAL;
811
812         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
813
814         if (rinfo->length == 3)
815                 prefix = (struct in6_addr *)rinfo->prefix;
816         else {
817                 /* this function is safe */
818                 ipv6_addr_prefix(&prefix_buf,
819                                  (struct in6_addr *)rinfo->prefix,
820                                  rinfo->prefix_len);
821                 prefix = &prefix_buf;
822         }
823
824         if (rinfo->prefix_len == 0)
825                 rt = rt6_get_dflt_router(gwaddr, dev);
826         else
827                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
828                                         gwaddr, dev);
829
830         if (rt && !lifetime) {
831                 ip6_del_rt(rt);
832                 rt = NULL;
833         }
834
835         if (!rt && lifetime)
836                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
837                                         dev, pref);
838         else if (rt)
839                 rt->rt6i_flags = RTF_ROUTEINFO |
840                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
841
842         if (rt) {
843                 if (!addrconf_finite_timeout(lifetime))
844                         rt6_clean_expires(rt);
845                 else
846                         rt6_set_expires(rt, jiffies + HZ * lifetime);
847
848                 ip6_rt_put(rt);
849         }
850         return 0;
851 }
852 #endif
853
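/* Walk back up the fib6 tree (descending into a source-address subtree
 * where one exists) until a node carrying routes (RTN_RTINFO) is found,
 * or return NULL once the tree root is reached.
 */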
854 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
855                                         struct in6_addr *saddr)
856 {
857         struct fib6_node *pn;
858         while (1) {
859                 if (fn->fn_flags & RTN_TL_ROOT)
860                         return NULL;
861                 pn = fn->parent;
862                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
863                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
864                 else
865                         fn = pn;
866                 if (fn->fn_flags & RTN_RTINFO)
867                         return fn;
868         }
869 }
870
871 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
872                                              struct fib6_table *table,
873                                              struct flowi6 *fl6, int flags)
874 {
875         struct fib6_node *fn;
876         struct rt6_info *rt;
877
878         read_lock_bh(&table->tb6_lock);
879         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
880 restart:
881         rt = fn->leaf;
882         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
883         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
884                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
885         if (rt == net->ipv6.ip6_null_entry) {
886                 fn = fib6_backtrack(fn, &fl6->saddr);
887                 if (fn)
888                         goto restart;
889         }
890         dst_use(&rt->dst, jiffies);
891         read_unlock_bh(&table->tb6_lock);
892
893         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
894
895         return rt;
896
897 }
898
899 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
900                                     int flags)
901 {
902         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
903 }
904 EXPORT_SYMBOL_GPL(ip6_route_lookup);
905
906 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
907                             const struct in6_addr *saddr, int oif, int strict)
908 {
909         struct flowi6 fl6 = {
910                 .flowi6_oif = oif,
911                 .daddr = *daddr,
912         };
913         struct dst_entry *dst;
914         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
915
916         if (saddr) {
917                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
918                 flags |= RT6_LOOKUP_F_HAS_SADDR;
919         }
920
921         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
922         if (dst->error == 0)
923                 return (struct rt6_info *) dst;
924
925         dst_release(dst);
926
927         return NULL;
928 }
929 EXPORT_SYMBOL(rt6_lookup);
930
931 /* ip6_ins_rt is called with table->tb6_lock free (not held).
932  * It takes a new route entry; if the addition fails for any reason,
933  * the route is released.
934  * Caller must hold dst before calling it.
935  */
936
937 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
938                         struct mx6_config *mxc,
939                         struct netlink_ext_ack *extack)
940 {
941         int err;
942         struct fib6_table *table;
943
944         table = rt->rt6i_table;
945         write_lock_bh(&table->tb6_lock);
946         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
947         write_unlock_bh(&table->tb6_lock);
948
949         return err;
950 }
951
952 int ip6_ins_rt(struct rt6_info *rt)
953 {
954         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
955         struct mx6_config mxc = { .mx = NULL, };
956
957         /* Hold dst to account for the reference from the fib6 tree */
958         dst_hold(&rt->dst);
959         return __ip6_ins_rt(rt, &info, &mxc, NULL);
960 }
961
962 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
963                                            const struct in6_addr *daddr,
964                                            const struct in6_addr *saddr)
965 {
966         struct rt6_info *rt;
967
968         /*
969          *      Clone the route.
970          */
971
972         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
973                 ort = (struct rt6_info *)ort->dst.from;
974
975         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
976
977         if (!rt)
978                 return NULL;
979
980         ip6_rt_copy_init(rt, ort);
981         rt->rt6i_flags |= RTF_CACHE;
982         rt->rt6i_metric = 0;
983         rt->dst.flags |= DST_HOST;
984         rt->rt6i_dst.addr = *daddr;
985         rt->rt6i_dst.plen = 128;
986
987         if (!rt6_is_gw_or_nonexthop(ort)) {
988                 if (ort->rt6i_dst.plen != 128 &&
989                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
990                         rt->rt6i_flags |= RTF_ANYCAST;
991 #ifdef CONFIG_IPV6_SUBTREES
992                 if (rt->rt6i_src.plen && saddr) {
993                         rt->rt6i_src.addr = *saddr;
994                         rt->rt6i_src.plen = 128;
995                 }
996 #endif
997         }
998
999         return rt;
1000 }
1001
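/* Allocate a per-cpu copy of @rt; the clone is flagged RTF_PCPU and is
 * installed into rt->rt6i_pcpu by rt6_make_pcpu_route().
 */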
1002 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1003 {
1004         struct rt6_info *pcpu_rt;
1005
1006         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1007                                   rt->dst.dev, rt->dst.flags);
1008
1009         if (!pcpu_rt)
1010                 return NULL;
1011         ip6_rt_copy_init(pcpu_rt, rt);
1012         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1013         pcpu_rt->rt6i_flags |= RTF_PCPU;
1014         return pcpu_rt;
1015 }
1016
1017 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1018 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1019 {
1020         struct rt6_info *pcpu_rt, **p;
1021
1022         p = this_cpu_ptr(rt->rt6i_pcpu);
1023         pcpu_rt = *p;
1024
1025         if (pcpu_rt) {
1026                 dst_hold(&pcpu_rt->dst);
1027                 rt6_dst_from_metrics_check(pcpu_rt);
1028         }
1029         return pcpu_rt;
1030 }
1031
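/* Install a newly allocated per-cpu clone of @rt with cmpxchg(); if one
 * was installed before us, drop our copy and return the existing one.
 * Falls back to @rt itself when the route has already left the fib6 tree.
 */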
1032 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1033 {
1034         struct fib6_table *table = rt->rt6i_table;
1035         struct rt6_info *pcpu_rt, *prev, **p;
1036
1037         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1038         if (!pcpu_rt) {
1039                 struct net *net = dev_net(rt->dst.dev);
1040
1041                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1042                 return net->ipv6.ip6_null_entry;
1043         }
1044
1045         read_lock_bh(&table->tb6_lock);
1046         if (rt->rt6i_pcpu) {
1047                 p = this_cpu_ptr(rt->rt6i_pcpu);
1048                 prev = cmpxchg(p, NULL, pcpu_rt);
1049                 if (prev) {
1050                         /* If someone did it before us, return prev instead */
1051                         dst_release_immediate(&pcpu_rt->dst);
1052                         pcpu_rt = prev;
1053                 }
1054         } else {
1055                 /* rt has been removed from the fib6 tree
1056                  * before we have a chance to acquire the read_lock.
1057                  * In this case, don't bother to create a pcpu rt
1058                  * since rt is going away anyway.  The next
1059                  * dst_check() will trigger a re-lookup.
1060                  */
1061                 dst_release_immediate(&pcpu_rt->dst);
1062                 pcpu_rt = rt;
1063         }
1064         dst_hold(&pcpu_rt->dst);
1065         rt6_dst_from_metrics_check(pcpu_rt);
1066         read_unlock_bh(&table->tb6_lock);
1067         return pcpu_rt;
1068 }
1069
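/* Core policy-routing lookup: find the best route in @table for @fl6
 * and return either the matching RTF_CACHE entry, an uncached clone
 * (the FLOWI_FLAG_KNOWN_NH case), or a per-cpu copy of the route.
 */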
1070 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1071                                int oif, struct flowi6 *fl6, int flags)
1072 {
1073         struct fib6_node *fn, *saved_fn;
1074         struct rt6_info *rt;
1075         int strict = 0;
1076
1077         strict |= flags & RT6_LOOKUP_F_IFACE;
1078         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1079         if (net->ipv6.devconf_all->forwarding == 0)
1080                 strict |= RT6_LOOKUP_F_REACHABLE;
1081
1082         read_lock_bh(&table->tb6_lock);
1083
1084         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1085         saved_fn = fn;
1086
1087         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1088                 oif = 0;
1089
1090 redo_rt6_select:
1091         rt = rt6_select(fn, oif, strict);
1092         if (rt->rt6i_nsiblings)
1093                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1094         if (rt == net->ipv6.ip6_null_entry) {
1095                 fn = fib6_backtrack(fn, &fl6->saddr);
1096                 if (fn)
1097                         goto redo_rt6_select;
1098                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1099                         /* also consider unreachable route */
1100                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1101                         fn = saved_fn;
1102                         goto redo_rt6_select;
1103                 }
1104         }
1105
1106
1107         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1108                 dst_use(&rt->dst, jiffies);
1109                 read_unlock_bh(&table->tb6_lock);
1110
1111                 rt6_dst_from_metrics_check(rt);
1112
1113                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1114                 return rt;
1115         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1116                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1117                 /* Create an RTF_CACHE clone which will not be
1118                  * owned by the fib6 tree.  It is for the special case where
1119                  * the daddr in the skb during the neighbor look-up is different
1120                  * from the fl6->daddr used to look up the route here.
1121                  */
1122
1123                 struct rt6_info *uncached_rt;
1124
1125                 dst_use(&rt->dst, jiffies);
1126                 read_unlock_bh(&table->tb6_lock);
1127
1128                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1129                 dst_release(&rt->dst);
1130
1131                 if (uncached_rt) {
1132                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc();
1133                          * no need for another dst_hold().
1134                          */
1135                         rt6_uncached_list_add(uncached_rt);
1136                 } else {
1137                         uncached_rt = net->ipv6.ip6_null_entry;
1138                         dst_hold(&uncached_rt->dst);
1139                 }
1140
1141                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1142                 return uncached_rt;
1143
1144         } else {
1145                 /* Get a percpu copy */
1146
1147                 struct rt6_info *pcpu_rt;
1148
1149                 rt->dst.lastuse = jiffies;
1150                 rt->dst.__use++;
1151                 pcpu_rt = rt6_get_pcpu_route(rt);
1152
1153                 if (pcpu_rt) {
1154                         read_unlock_bh(&table->tb6_lock);
1155                 } else {
1156                         /* We have to do the read_unlock first
1157                          * because rt6_make_pcpu_route() may trigger
1158                          * ip6_dst_gc() which will take the write_lock.
1159                          */
1160                         dst_hold(&rt->dst);
1161                         read_unlock_bh(&table->tb6_lock);
1162                         pcpu_rt = rt6_make_pcpu_route(rt);
1163                         dst_release(&rt->dst);
1164                 }
1165
1166                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1167                 return pcpu_rt;
1168
1169         }
1170 }
1171 EXPORT_SYMBOL_GPL(ip6_pol_route);
1172
1173 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1174                                             struct flowi6 *fl6, int flags)
1175 {
1176         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1177 }
1178
1179 struct dst_entry *ip6_route_input_lookup(struct net *net,
1180                                          struct net_device *dev,
1181                                          struct flowi6 *fl6, int flags)
1182 {
1183         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1184                 flags |= RT6_LOOKUP_F_IFACE;
1185
1186         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1187 }
1188 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1189
1190 void ip6_route_input(struct sk_buff *skb)
1191 {
1192         const struct ipv6hdr *iph = ipv6_hdr(skb);
1193         struct net *net = dev_net(skb->dev);
1194         int flags = RT6_LOOKUP_F_HAS_SADDR;
1195         struct ip_tunnel_info *tun_info;
1196         struct flowi6 fl6 = {
1197                 .flowi6_iif = skb->dev->ifindex,
1198                 .daddr = iph->daddr,
1199                 .saddr = iph->saddr,
1200                 .flowlabel = ip6_flowinfo(iph),
1201                 .flowi6_mark = skb->mark,
1202                 .flowi6_proto = iph->nexthdr,
1203         };
1204
1205         tun_info = skb_tunnel_info(skb);
1206         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1207                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1208         skb_dst_drop(skb);
1209         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1210 }
1211
1212 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1213                                              struct flowi6 *fl6, int flags)
1214 {
1215         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1216 }
1217
1218 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1219                                          struct flowi6 *fl6, int flags)
1220 {
1221         bool any_src;
1222
1223         if (rt6_need_strict(&fl6->daddr)) {
1224                 struct dst_entry *dst;
1225
1226                 dst = l3mdev_link_scope_lookup(net, fl6);
1227                 if (dst)
1228                         return dst;
1229         }
1230
1231         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1232
1233         any_src = ipv6_addr_any(&fl6->saddr);
1234         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1235             (fl6->flowi6_oif && any_src))
1236                 flags |= RT6_LOOKUP_F_IFACE;
1237
1238         if (!any_src)
1239                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1240         else if (sk)
1241                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1242
1243         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1244 }
1245 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1246
1247 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1248 {
1249         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1250         struct net_device *loopback_dev = net->loopback_dev;
1251         struct dst_entry *new = NULL;
1252
1253         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1254                        DST_OBSOLETE_NONE, 0);
1255         if (rt) {
1256                 rt6_info_init(rt);
1257
1258                 new = &rt->dst;
1259                 new->__use = 1;
1260                 new->input = dst_discard;
1261                 new->output = dst_discard_out;
1262
1263                 dst_copy_metrics(new, &ort->dst);
1264
1265                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1266                 rt->rt6i_gateway = ort->rt6i_gateway;
1267                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1268                 rt->rt6i_metric = 0;
1269
1270                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1271 #ifdef CONFIG_IPV6_SUBTREES
1272                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1273 #endif
1274         }
1275
1276         dst_release(dst_orig);
1277         return new ? new : ERR_PTR(-ENOMEM);
1278 }
1279
1280 /*
1281  *      Destination cache support functions
1282  */
1283
1284 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1285 {
1286         if (rt->dst.from &&
1287             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1288                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1289 }
1290
1291 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1292 {
1293         u32 rt_cookie = 0;
1294
1295         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1296                 return NULL;
1297
1298         if (rt6_check_expired(rt))
1299                 return NULL;
1300
1301         return &rt->dst;
1302 }
1303
1304 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1305 {
1306         if (!__rt6_check_expired(rt) &&
1307             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1308             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1309                 return &rt->dst;
1310         else
1311                 return NULL;
1312 }
1313
1314 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1315 {
1316         struct rt6_info *rt;
1317
1318         rt = (struct rt6_info *) dst;
1319
1320         /* All IPv6 dsts are created with ->obsolete set to the value
1321          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1322          * into this function always.
1323          */
1324
1325         rt6_dst_from_metrics_check(rt);
1326
1327         if (rt->rt6i_flags & RTF_PCPU ||
1328             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1329                 return rt6_dst_from_check(rt, cookie);
1330         else
1331                 return rt6_check(rt, cookie);
1332 }
1333
1334 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1335 {
1336         struct rt6_info *rt = (struct rt6_info *) dst;
1337
1338         if (rt) {
1339                 if (rt->rt6i_flags & RTF_CACHE) {
1340                         if (rt6_check_expired(rt)) {
1341                                 ip6_del_rt(rt);
1342                                 dst = NULL;
1343                         }
1344                 } else {
1345                         dst_release(dst);
1346                         dst = NULL;
1347                 }
1348         }
1349         return dst;
1350 }
1351
1352 static void ip6_link_failure(struct sk_buff *skb)
1353 {
1354         struct rt6_info *rt;
1355
1356         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1357
1358         rt = (struct rt6_info *) skb_dst(skb);
1359         if (rt) {
1360                 if (rt->rt6i_flags & RTF_CACHE) {
1361                         if (dst_hold_safe(&rt->dst))
1362                                 ip6_del_rt(rt);
1363                 } else {
1364                         struct fib6_node *fn;
1365
1366                         rcu_read_lock();
1367                         fn = rcu_dereference(rt->rt6i_node);
1368                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
1369                                 fn->fn_sernum = -1;
1370                         rcu_read_unlock();
1371                 }
1372         }
1373 }
1374
1375 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1376 {
1377         struct net *net = dev_net(rt->dst.dev);
1378
1379         rt->rt6i_flags |= RTF_MODIFIED;
1380         rt->rt6i_pmtu = mtu;
1381         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1382 }
1383
1384 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1385 {
1386         return !(rt->rt6i_flags & RTF_CACHE) &&
1387                 (rt->rt6i_flags & RTF_PCPU ||
1388                  rcu_access_pointer(rt->rt6i_node));
1389 }
1390
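/* Record a reduced path MTU: RTF_CACHE and detached routes are updated
 * in place, while per-cpu and fib6-tree routes get an RTF_CACHE clone
 * carrying the new MTU (see rt6_cache_allowed_for_pmtu()).
 */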
1391 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1392                                  const struct ipv6hdr *iph, u32 mtu)
1393 {
1394         const struct in6_addr *daddr, *saddr;
1395         struct rt6_info *rt6 = (struct rt6_info *)dst;
1396
1397         if (rt6->rt6i_flags & RTF_LOCAL)
1398                 return;
1399
1400         if (dst_metric_locked(dst, RTAX_MTU))
1401                 return;
1402
1403         if (iph) {
1404                 daddr = &iph->daddr;
1405                 saddr = &iph->saddr;
1406         } else if (sk) {
1407                 daddr = &sk->sk_v6_daddr;
1408                 saddr = &inet6_sk(sk)->saddr;
1409         } else {
1410                 daddr = NULL;
1411                 saddr = NULL;
1412         }
1413         dst_confirm_neigh(dst, daddr);
1414         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1415         if (mtu >= dst_mtu(dst))
1416                 return;
1417
1418         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1419                 rt6_do_update_pmtu(rt6, mtu);
1420         } else if (daddr) {
1421                 struct rt6_info *nrt6;
1422
1423                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1424                 if (nrt6) {
1425                         rt6_do_update_pmtu(nrt6, mtu);
1426
1427                         /* ip6_ins_rt(nrt6) will bump the
1428                          * rt6->rt6i_node->fn_sernum
1429                          * which will fail the next rt6_check() and
1430                          * invalidate the sk->sk_dst_cache.
1431                          */
1432                         ip6_ins_rt(nrt6);
1433                         /* Release the reference taken in
1434                          * ip6_rt_cache_alloc()
1435                          */
1436                         dst_release(&nrt6->dst);
1437                 }
1438         }
1439 }
1440
1441 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1442                                struct sk_buff *skb, u32 mtu)
1443 {
1444         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1445 }
1446
1447 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1448                      int oif, u32 mark, kuid_t uid)
1449 {
1450         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1451         struct dst_entry *dst;
1452         struct flowi6 fl6;
1453
1454         memset(&fl6, 0, sizeof(fl6));
1455         fl6.flowi6_oif = oif;
1456         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1457         fl6.daddr = iph->daddr;
1458         fl6.saddr = iph->saddr;
1459         fl6.flowlabel = ip6_flowinfo(iph);
1460         fl6.flowi6_uid = uid;
1461
1462         dst = ip6_route_output(net, NULL, &fl6);
1463         if (!dst->error)
1464                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1465         dst_release(dst);
1466 }
1467 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1468
1469 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1470 {
1471         struct dst_entry *dst;
1472
1473         ip6_update_pmtu(skb, sock_net(sk), mtu,
1474                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1475
1476         dst = __sk_dst_get(sk);
1477         if (!dst || !dst->obsolete ||
1478             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1479                 return;
1480
1481         bh_lock_sock(sk);
1482         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1483                 ip6_datagram_dst_update(sk, false);
1484         bh_unlock_sock(sk);
1485 }
1486 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1487
1488 /* Handle redirects */
1489 struct ip6rd_flowi {
1490         struct flowi6 fl6;
1491         struct in6_addr gateway;
1492 };
1493
1494 static struct rt6_info *__ip6_route_redirect(struct net *net,
1495                                              struct fib6_table *table,
1496                                              struct flowi6 *fl6,
1497                                              int flags)
1498 {
1499         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1500         struct rt6_info *rt;
1501         struct fib6_node *fn;
1502
1503         /* Get the "current" route for this destination and
1504          * check if the redirect has come from the appropriate router.
1505          *
1506          * RFC 4861 specifies that redirects should only be
1507          * accepted if they come from the nexthop to the target.
1508          * Due to the way the routes are chosen, this notion
1509          * is a bit fuzzy and one might need to check all possible
1510          * routes.
1511          */
1512
1513         read_lock_bh(&table->tb6_lock);
1514         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1515 restart:
1516         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1517                 if (rt6_check_expired(rt))
1518                         continue;
1519                 if (rt->dst.error)
1520                         break;
1521                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1522                         continue;
1523                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1524                         continue;
1525                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1526                         continue;
1527                 break;
1528         }
1529
1530         if (!rt)
1531                 rt = net->ipv6.ip6_null_entry;
1532         else if (rt->dst.error) {
1533                 rt = net->ipv6.ip6_null_entry;
1534                 goto out;
1535         }
1536
1537         if (rt == net->ipv6.ip6_null_entry) {
1538                 fn = fib6_backtrack(fn, &fl6->saddr);
1539                 if (fn)
1540                         goto restart;
1541         }
1542
1543 out:
1544         dst_hold(&rt->dst);
1545
1546         read_unlock_bh(&table->tb6_lock);
1547
1548         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1549         return rt;
1550 }
1551
1552 static struct dst_entry *ip6_route_redirect(struct net *net,
1553                                         const struct flowi6 *fl6,
1554                                         const struct in6_addr *gateway)
1555 {
1556         int flags = RT6_LOOKUP_F_HAS_SADDR;
1557         struct ip6rd_flowi rdfl;
1558
1559         rdfl.fl6 = *fl6;
1560         rdfl.gateway = *gateway;
1561
1562         return fib6_rule_lookup(net, &rdfl.fl6,
1563                                 flags, __ip6_route_redirect);
1564 }
1565
1566 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1567                   kuid_t uid)
1568 {
1569         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1570         struct dst_entry *dst;
1571         struct flowi6 fl6;
1572
1573         memset(&fl6, 0, sizeof(fl6));
1574         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1575         fl6.flowi6_oif = oif;
1576         fl6.flowi6_mark = mark;
1577         fl6.daddr = iph->daddr;
1578         fl6.saddr = iph->saddr;
1579         fl6.flowlabel = ip6_flowinfo(iph);
1580         fl6.flowi6_uid = uid;
1581
1582         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1583         rt6_do_redirect(dst, NULL, skb);
1584         dst_release(dst);
1585 }
1586 EXPORT_SYMBOL_GPL(ip6_redirect);
1587
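/*
 * Like ip6_redirect(), but for callers that only have the ICMPv6
 * redirect message and not the original packet header: the destination
 * comes from the rd_msg itself and the flow source from the
 * destination address of the redirect's outer IPv6 header.
 */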
1588 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1589                             u32 mark)
1590 {
1591         const struct ipv6hdr *iph = ipv6_hdr(skb);
1592         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1593         struct dst_entry *dst;
1594         struct flowi6 fl6;
1595
1596         memset(&fl6, 0, sizeof(fl6));
1597         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1598         fl6.flowi6_oif = oif;
1599         fl6.flowi6_mark = mark;
1600         fl6.daddr = msg->dest;
1601         fl6.saddr = iph->daddr;
1602         fl6.flowi6_uid = sock_net_uid(net, NULL);
1603
1604         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1605         rt6_do_redirect(dst, NULL, skb);
1606         dst_release(dst);
1607 }
1608
1609 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1610 {
1611         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1612                      sk->sk_uid);
1613 }
1614 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1615
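/*
 * Default advertised MSS for this route: the dst MTU minus the IPv6
 * and TCP headers, clamped from below by the ip6_rt_min_advmss sysctl
 * and capped at IPV6_MAXPLEN.
 */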
1616 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1617 {
1618         struct net_device *dev = dst->dev;
1619         unsigned int mtu = dst_mtu(dst);
1620         struct net *net = dev_net(dev);
1621
1622         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1623
1624         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1625                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1626
1627         /*
1628          * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
1629          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1630          * Returning IPV6_MAXPLEN itself is also valid and means: "any
1631          * MSS, rely only on pmtu discovery".
1632          */
1633         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1634                 mtu = IPV6_MAXPLEN;
1635         return mtu;
1636 }
1637
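/*
 * Effective MTU of the dst: prefer the cached path MTU, then the
 * RTAX_MTU metric, then the device's IPv6 MTU (IPV6_MIN_MTU if no
 * inet6_dev is attached), capped at IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */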
1638 static unsigned int ip6_mtu(const struct dst_entry *dst)
1639 {
1640         const struct rt6_info *rt = (const struct rt6_info *)dst;
1641         unsigned int mtu = rt->rt6i_pmtu;
1642         struct inet6_dev *idev;
1643
1644         if (mtu)
1645                 goto out;
1646
1647         mtu = dst_metric_raw(dst, RTAX_MTU);
1648         if (mtu)
1649                 goto out;
1650
1651         mtu = IPV6_MIN_MTU;
1652
1653         rcu_read_lock();
1654         idev = __in6_dev_get(dst->dev);
1655         if (idev)
1656                 mtu = idev->cnf.mtu6;
1657         rcu_read_unlock();
1658
1659 out:
1660         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1661
1662         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1663 }
1664
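/*
 * Allocate a route for an outgoing ICMPv6 packet without inserting it
 * into the FIB.  The dst is placed on the uncached list so device
 * unregistration can release it, and is passed through xfrm_lookup()
 * before being returned.
 */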
1665 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1666                                   struct flowi6 *fl6)
1667 {
1668         struct dst_entry *dst;
1669         struct rt6_info *rt;
1670         struct inet6_dev *idev = in6_dev_get(dev);
1671         struct net *net = dev_net(dev);
1672
1673         if (unlikely(!idev))
1674                 return ERR_PTR(-ENODEV);
1675
1676         rt = ip6_dst_alloc(net, dev, 0);
1677         if (unlikely(!rt)) {
1678                 in6_dev_put(idev);
1679                 dst = ERR_PTR(-ENOMEM);
1680                 goto out;
1681         }
1682
1683         rt->dst.flags |= DST_HOST;
1684         rt->dst.output  = ip6_output;
1685         rt->rt6i_gateway  = fl6->daddr;
1686         rt->rt6i_dst.addr = fl6->daddr;
1687         rt->rt6i_dst.plen = 128;
1688         rt->rt6i_idev     = idev;
1689         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1690
1691         /* Add this dst to uncached_list so that rt6_ifdown() can
1692          * properly release the net_device
1693          */
1694         rt6_uncached_list_add(rt);
1695
1696         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1697
1698 out:
1699         return dst;
1700 }
1701
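/*
 * dst_ops garbage collector: skip the run when the previous GC was
 * recent and the entry count is within ip6_rt_max_size, otherwise call
 * fib6_run_gc() with an adaptive expire value and report whether the
 * number of entries still exceeds the limit.
 */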
1702 static int ip6_dst_gc(struct dst_ops *ops)
1703 {
1704         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1705         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1706         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1707         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1708         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1709         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1710         int entries;
1711
1712         entries = dst_entries_get_fast(ops);
1713         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1714             entries <= rt_max_size)
1715                 goto out;
1716
1717         net->ipv6.ip6_rt_gc_expire++;
1718         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1719         entries = dst_entries_get_slow(ops);
1720         if (entries < ops->gc_thresh)
1721                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1722 out:
1723         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1724         return entries > rt_max_size;
1725 }
1726
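/*
 * Convert the RTA_METRICS attributes in @cfg into the RTAX array of
 * @mxc.  RTAX_CC_ALGO names are translated to keys, RTAX_HOPLIMIT is
 * clamped to 255, and out-of-range or unknown values fail the whole
 * conversion with -EINVAL.
 */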
1727 static int ip6_convert_metrics(struct mx6_config *mxc,
1728                                const struct fib6_config *cfg)
1729 {
1730         bool ecn_ca = false;
1731         struct nlattr *nla;
1732         int remaining;
1733         u32 *mp;
1734
1735         if (!cfg->fc_mx)
1736                 return 0;
1737
1738         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1739         if (unlikely(!mp))
1740                 return -ENOMEM;
1741
1742         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1743                 int type = nla_type(nla);
1744                 u32 val;
1745
1746                 if (!type)
1747                         continue;
1748                 if (unlikely(type > RTAX_MAX))
1749                         goto err;
1750
1751                 if (type == RTAX_CC_ALGO) {
1752                         char tmp[TCP_CA_NAME_MAX];
1753
1754                         nla_strlcpy(tmp, nla, sizeof(tmp));
1755                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1756                         if (val == TCP_CA_UNSPEC)
1757                                 goto err;
1758                 } else {
1759                         val = nla_get_u32(nla);
1760                 }
1761                 if (type == RTAX_HOPLIMIT && val > 255)
1762                         val = 255;
1763                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1764                         goto err;
1765
1766                 mp[type - 1] = val;
1767                 __set_bit(type - 1, mxc->mx_valid);
1768         }
1769
1770         if (ecn_ca) {
1771                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1772                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1773         }
1774
1775         mxc->mx = mp;
1776         return 0;
1777  err:
1778         kfree(mp);
1779         return -EINVAL;
1780 }
1781
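/*
 * Resolve a nexthop gateway within the table given in @cfg.  Returns
 * NULL when the table does not exist or the lookup only yields the
 * null entry, so the caller can fall back to a full rt6_lookup().
 */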
1782 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1783                                             struct fib6_config *cfg,
1784                                             const struct in6_addr *gw_addr)
1785 {
1786         struct flowi6 fl6 = {
1787                 .flowi6_oif = cfg->fc_ifindex,
1788                 .daddr = *gw_addr,
1789                 .saddr = cfg->fc_prefsrc,
1790         };
1791         struct fib6_table *table;
1792         struct rt6_info *rt;
1793         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1794
1795         table = fib6_get_table(net, cfg->fc_table);
1796         if (!table)
1797                 return NULL;
1798
1799         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1800                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1801
1802         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1803
1804         /* if table lookup failed, fall back to full lookup */
1805         if (rt == net->ipv6.ip6_null_entry) {
1806                 ip6_rt_put(rt);
1807                 rt = NULL;
1808         }
1809
1810         return rt;
1811 }
1812
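/*
 * Validate a fib6_config and build the matching rt6_info without
 * inserting it into the FIB: resolve the egress device, table and
 * lightweight tunnel state, check gateway and preferred source
 * addresses, and turn reject/loopback routes into the corresponding
 * error dsts.
 */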
1813 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1814                                               struct netlink_ext_ack *extack)
1815 {
1816         struct net *net = cfg->fc_nlinfo.nl_net;
1817         struct rt6_info *rt = NULL;
1818         struct net_device *dev = NULL;
1819         struct inet6_dev *idev = NULL;
1820         struct fib6_table *table;
1821         int addr_type;
1822         int err = -EINVAL;
1823
1824         /* RTF_PCPU is an internal flag; can not be set by userspace */
1825         if (cfg->fc_flags & RTF_PCPU) {
1826                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1827                 goto out;
1828         }
1829
1830         if (cfg->fc_dst_len > 128) {
1831                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1832                 goto out;
1833         }
1834         if (cfg->fc_src_len > 128) {
1835                 NL_SET_ERR_MSG(extack, "Invalid source address length");
1836                 goto out;
1837         }
1838 #ifndef CONFIG_IPV6_SUBTREES
1839         if (cfg->fc_src_len) {
1840                 NL_SET_ERR_MSG(extack,
1841                                "Specifying source address requires IPV6_SUBTREES to be enabled");
1842                 goto out;
1843         }
1844 #endif
1845         if (cfg->fc_ifindex) {
1846                 err = -ENODEV;
1847                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1848                 if (!dev)
1849                         goto out;
1850                 idev = in6_dev_get(dev);
1851                 if (!idev)
1852                         goto out;
1853         }
1854
1855         if (cfg->fc_metric == 0)
1856                 cfg->fc_metric = IP6_RT_PRIO_USER;
1857
1858         err = -ENOBUFS;
1859         if (cfg->fc_nlinfo.nlh &&
1860             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1861                 table = fib6_get_table(net, cfg->fc_table);
1862                 if (!table) {
1863                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1864                         table = fib6_new_table(net, cfg->fc_table);
1865                 }
1866         } else {
1867                 table = fib6_new_table(net, cfg->fc_table);
1868         }
1869
1870         if (!table)
1871                 goto out;
1872
1873         rt = ip6_dst_alloc(net, NULL,
1874                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1875
1876         if (!rt) {
1877                 err = -ENOMEM;
1878                 goto out;
1879         }
1880
1881         if (cfg->fc_flags & RTF_EXPIRES)
1882                 rt6_set_expires(rt, jiffies +
1883                                 clock_t_to_jiffies(cfg->fc_expires));
1884         else
1885                 rt6_clean_expires(rt);
1886
1887         if (cfg->fc_protocol == RTPROT_UNSPEC)
1888                 cfg->fc_protocol = RTPROT_BOOT;
1889         rt->rt6i_protocol = cfg->fc_protocol;
1890
1891         addr_type = ipv6_addr_type(&cfg->fc_dst);
1892
1893         if (addr_type & IPV6_ADDR_MULTICAST)
1894                 rt->dst.input = ip6_mc_input;
1895         else if (cfg->fc_flags & RTF_LOCAL)
1896                 rt->dst.input = ip6_input;
1897         else
1898                 rt->dst.input = ip6_forward;
1899
1900         rt->dst.output = ip6_output;
1901
1902         if (cfg->fc_encap) {
1903                 struct lwtunnel_state *lwtstate;
1904
1905                 err = lwtunnel_build_state(cfg->fc_encap_type,
1906                                            cfg->fc_encap, AF_INET6, cfg,
1907                                            &lwtstate, extack);
1908                 if (err)
1909                         goto out;
1910                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1911                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1912                         rt->dst.lwtstate->orig_output = rt->dst.output;
1913                         rt->dst.output = lwtunnel_output;
1914                 }
1915                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1916                         rt->dst.lwtstate->orig_input = rt->dst.input;
1917                         rt->dst.input = lwtunnel_input;
1918                 }
1919         }
1920
1921         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1922         rt->rt6i_dst.plen = cfg->fc_dst_len;
1923         if (rt->rt6i_dst.plen == 128)
1924                 rt->dst.flags |= DST_HOST;
1925
1926 #ifdef CONFIG_IPV6_SUBTREES
1927         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1928         rt->rt6i_src.plen = cfg->fc_src_len;
1929 #endif
1930
1931         rt->rt6i_metric = cfg->fc_metric;
1932
1933         /* We cannot add true routes via loopback here;
1934          * they would result in kernel looping.  Promote them to reject routes.
1935          */
1936         if ((cfg->fc_flags & RTF_REJECT) ||
1937             (dev && (dev->flags & IFF_LOOPBACK) &&
1938              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1939              !(cfg->fc_flags & RTF_LOCAL))) {
1940                 /* hold loopback dev/idev if we haven't done so. */
1941                 if (dev != net->loopback_dev) {
1942                         if (dev) {
1943                                 dev_put(dev);
1944                                 in6_dev_put(idev);
1945                         }
1946                         dev = net->loopback_dev;
1947                         dev_hold(dev);
1948                         idev = in6_dev_get(dev);
1949                         if (!idev) {
1950                                 err = -ENODEV;
1951                                 goto out;
1952                         }
1953                 }
1954                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1955                 switch (cfg->fc_type) {
1956                 case RTN_BLACKHOLE:
1957                         rt->dst.error = -EINVAL;
1958                         rt->dst.output = dst_discard_out;
1959                         rt->dst.input = dst_discard;
1960                         break;
1961                 case RTN_PROHIBIT:
1962                         rt->dst.error = -EACCES;
1963                         rt->dst.output = ip6_pkt_prohibit_out;
1964                         rt->dst.input = ip6_pkt_prohibit;
1965                         break;
1966                 case RTN_THROW:
1967                 case RTN_UNREACHABLE:
1968                 default:
1969                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1970                                         : (cfg->fc_type == RTN_UNREACHABLE)
1971                                         ? -EHOSTUNREACH : -ENETUNREACH;
1972                         rt->dst.output = ip6_pkt_discard_out;
1973                         rt->dst.input = ip6_pkt_discard;
1974                         break;
1975                 }
1976                 goto install_route;
1977         }
1978
1979         if (cfg->fc_flags & RTF_GATEWAY) {
1980                 const struct in6_addr *gw_addr;
1981                 int gwa_type;
1982
1983                 gw_addr = &cfg->fc_gateway;
1984                 gwa_type = ipv6_addr_type(gw_addr);
1985
1986                 /* if gw_addr is local we will fail to detect this in case
1987                  * the address is still TENTATIVE (DAD in progress). rt6_lookup()
1988                  * will return the already-added prefix route via the interface
1989                  * that the prefix route was assigned to, which might be non-loopback.
1990                  */
1991                 err = -EINVAL;
1992                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1993                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1994                                             dev : NULL, 0, 0)) {
1995                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
1996                         goto out;
1997                 }
1998                 rt->rt6i_gateway = *gw_addr;
1999
2000                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2001                         struct rt6_info *grt = NULL;
2002
2003                         /* IPv6 strictly inhibits using non-link-local
2004                            addresses as a nexthop address; otherwise the
2005                            router will not be able to send redirects.
2006                            That is usually desirable, but in some (rare!)
2007                            circumstances (SIT, PtP, NBMA NOARP links) it is
2008                            handy to allow some exceptions. --ANK
2009                            We allow IPv4-mapped nexthops to support
2010                            RFC 4798-type addressing.
2011                          */
2012                         if (!(gwa_type & (IPV6_ADDR_UNICAST |
2013                                           IPV6_ADDR_MAPPED))) {
2014                                 NL_SET_ERR_MSG(extack,
2015                                                "Invalid gateway address");
2016                                 goto out;
2017                         }
2018
2019                         if (cfg->fc_table) {
2020                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2021
2022                                 if (grt) {
2023                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2024                                             (dev && dev != grt->dst.dev)) {
2025                                                 ip6_rt_put(grt);
2026                                                 grt = NULL;
2027                                         }
2028                                 }
2029                         }
2030
2031                         if (!grt)
2032                                 grt = rt6_lookup(net, gw_addr, NULL,
2033                                                  cfg->fc_ifindex, 1);
2034
2035                         err = -EHOSTUNREACH;
2036                         if (!grt)
2037                                 goto out;
2038                         if (dev) {
2039                                 if (dev != grt->dst.dev) {
2040                                         ip6_rt_put(grt);
2041                                         goto out;
2042                                 }
2043                         } else {
2044                                 dev = grt->dst.dev;
2045                                 idev = grt->rt6i_idev;
2046                                 dev_hold(dev);
2047                                 in6_dev_hold(grt->rt6i_idev);
2048                         }
2049                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2050                                 err = 0;
2051                         ip6_rt_put(grt);
2052
2053                         if (err)
2054                                 goto out;
2055                 }
2056                 err = -EINVAL;
2057                 if (!dev) {
2058                         NL_SET_ERR_MSG(extack, "Egress device not specified");
2059                         goto out;
2060                 } else if (dev->flags & IFF_LOOPBACK) {
2061                         NL_SET_ERR_MSG(extack,
2062                                        "Egress device can not be loopback device for this route");
2063                         goto out;
2064                 }
2065         }
2066
2067         err = -ENODEV;
2068         if (!dev)
2069                 goto out;
2070
2071         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2072                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2073                         NL_SET_ERR_MSG(extack, "Invalid source address");
2074                         err = -EINVAL;
2075                         goto out;
2076                 }
2077                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2078                 rt->rt6i_prefsrc.plen = 128;
2079         } else
2080                 rt->rt6i_prefsrc.plen = 0;
2081
2082         rt->rt6i_flags = cfg->fc_flags;
2083
2084 install_route:
2085         rt->dst.dev = dev;
2086         rt->rt6i_idev = idev;
2087         rt->rt6i_table = table;
2088
2089         cfg->fc_nlinfo.nl_net = dev_net(dev);
2090
2091         return rt;
2092 out:
2093         if (dev)
2094                 dev_put(dev);
2095         if (idev)
2096                 in6_dev_put(idev);
2097         if (rt)
2098                 dst_release_immediate(&rt->dst);
2099
2100         return ERR_PTR(err);
2101 }
2102
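/*
 * Create a route from @cfg and insert it into the FIB, converting any
 * netlink metrics along the way.  rt6_add_dflt_router() below shows a
 * typical in-kernel caller.
 */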
2103 int ip6_route_add(struct fib6_config *cfg,
2104                   struct netlink_ext_ack *extack)
2105 {
2106         struct mx6_config mxc = { .mx = NULL, };
2107         struct rt6_info *rt;
2108         int err;
2109
2110         rt = ip6_route_info_create(cfg, extack);
2111         if (IS_ERR(rt)) {
2112                 err = PTR_ERR(rt);
2113                 rt = NULL;
2114                 goto out;
2115         }
2116
2117         err = ip6_convert_metrics(&mxc, cfg);
2118         if (err)
2119                 goto out;
2120
2121         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2122
2123         kfree(mxc.mx);
2124
2125         return err;
2126 out:
2127         if (rt)
2128                 dst_release_immediate(&rt->dst);
2129
2130         return err;
2131 }
2132
2133 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2134 {
2135         int err;
2136         struct fib6_table *table;
2137         struct net *net = dev_net(rt->dst.dev);
2138
2139         if (rt == net->ipv6.ip6_null_entry) {
2140                 err = -ENOENT;
2141                 goto out;
2142         }
2143
2144         table = rt->rt6i_table;
2145         write_lock_bh(&table->tb6_lock);
2146         err = fib6_del(rt, info);
2147         write_unlock_bh(&table->tb6_lock);
2148
2149 out:
2150         ip6_rt_put(rt);
2151         return err;
2152 }
2153
2154 int ip6_del_rt(struct rt6_info *rt)
2155 {
2156         struct nl_info info = {
2157                 .nl_net = dev_net(rt->dst.dev),
2158         };
2159         return __ip6_del_rt(rt, &info);
2160 }
2161
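/*
 * Delete a (possibly multipath) route: when fc_delete_all_nh is set,
 * remove every sibling nexthop under a single table lock and send one
 * RTM_DELROUTE notification covering the whole route.
 */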
2162 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2163 {
2164         struct nl_info *info = &cfg->fc_nlinfo;
2165         struct net *net = info->nl_net;
2166         struct sk_buff *skb = NULL;
2167         struct fib6_table *table;
2168         int err = -ENOENT;
2169
2170         if (rt == net->ipv6.ip6_null_entry)
2171                 goto out_put;
2172         table = rt->rt6i_table;
2173         write_lock_bh(&table->tb6_lock);
2174
2175         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2176                 struct rt6_info *sibling, *next_sibling;
2177
2178                 /* prefer to send a single notification with all hops */
2179                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2180                 if (skb) {
2181                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2182
2183                         if (rt6_fill_node(net, skb, rt,
2184                                           NULL, NULL, 0, RTM_DELROUTE,
2185                                           info->portid, seq, 0) < 0) {
2186                                 kfree_skb(skb);
2187                                 skb = NULL;
2188                         } else
2189                                 info->skip_notify = 1;
2190                 }
2191
2192                 list_for_each_entry_safe(sibling, next_sibling,
2193                                          &rt->rt6i_siblings,
2194                                          rt6i_siblings) {
2195                         err = fib6_del(sibling, info);
2196                         if (err)
2197                                 goto out_unlock;
2198                 }
2199         }
2200
2201         err = fib6_del(rt, info);
2202 out_unlock:
2203         write_unlock_bh(&table->tb6_lock);
2204 out_put:
2205         ip6_rt_put(rt);
2206
2207         if (skb) {
2208                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2209                             info->nlh, gfp_any());
2210         }
2211         return err;
2212 }
2213
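/*
 * Delete the route matching @cfg: locate the fib6 node for the prefix
 * and remove the first leaf whose device, gateway, metric and protocol
 * match.  With RTF_GATEWAY only that one nexthop is removed; otherwise
 * siblings may be deleted as well via __ip6_del_rt_siblings().
 */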
2214 static int ip6_route_del(struct fib6_config *cfg,
2215                          struct netlink_ext_ack *extack)
2216 {
2217         struct fib6_table *table;
2218         struct fib6_node *fn;
2219         struct rt6_info *rt;
2220         int err = -ESRCH;
2221
2222         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2223         if (!table) {
2224                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2225                 return err;
2226         }
2227
2228         read_lock_bh(&table->tb6_lock);
2229
2230         fn = fib6_locate(&table->tb6_root,
2231                          &cfg->fc_dst, cfg->fc_dst_len,
2232                          &cfg->fc_src, cfg->fc_src_len);
2233
2234         if (fn) {
2235                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2236                         if ((rt->rt6i_flags & RTF_CACHE) &&
2237                             !(cfg->fc_flags & RTF_CACHE))
2238                                 continue;
2239                         if (cfg->fc_ifindex &&
2240                             (!rt->dst.dev ||
2241                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2242                                 continue;
2243                         if (cfg->fc_flags & RTF_GATEWAY &&
2244                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2245                                 continue;
2246                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2247                                 continue;
2248                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2249                                 continue;
2250                         dst_hold(&rt->dst);
2251                         read_unlock_bh(&table->tb6_lock);
2252
2253                         /* if a gateway was specified, only delete that one hop */
2254                         if (cfg->fc_flags & RTF_GATEWAY)
2255                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2256
2257                         return __ip6_del_rt_siblings(rt, cfg);
2258                 }
2259         }
2260         read_unlock_bh(&table->tb6_lock);
2261
2262         return err;
2263 }
2264
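/*
 * Process an ICMPv6 redirect: sanity-check the message and its ND
 * options, update the neighbour entry for the new first hop and
 * install a cached RTF_DYNAMIC route towards it, removing any old
 * RTF_CACHE entry for the destination.
 */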
2265 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2266 {
2267         struct netevent_redirect netevent;
2268         struct rt6_info *rt, *nrt = NULL;
2269         struct ndisc_options ndopts;
2270         struct inet6_dev *in6_dev;
2271         struct neighbour *neigh;
2272         struct rd_msg *msg;
2273         int optlen, on_link;
2274         u8 *lladdr;
2275
2276         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2277         optlen -= sizeof(*msg);
2278
2279         if (optlen < 0) {
2280                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2281                 return;
2282         }
2283
2284         msg = (struct rd_msg *)icmp6_hdr(skb);
2285
2286         if (ipv6_addr_is_multicast(&msg->dest)) {
2287                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2288                 return;
2289         }
2290
2291         on_link = 0;
2292         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2293                 on_link = 1;
2294         } else if (ipv6_addr_type(&msg->target) !=
2295                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2296                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2297                 return;
2298         }
2299
2300         in6_dev = __in6_dev_get(skb->dev);
2301         if (!in6_dev)
2302                 return;
2303         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2304                 return;
2305
2306         /* RFC2461 8.1:
2307          *      The IP source address of the Redirect MUST be the same as the current
2308          *      first-hop router for the specified ICMP Destination Address.
2309          */
2310
2311         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2312                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2313                 return;
2314         }
2315
2316         lladdr = NULL;
2317         if (ndopts.nd_opts_tgt_lladdr) {
2318                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2319                                              skb->dev);
2320                 if (!lladdr) {
2321                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2322                         return;
2323                 }
2324         }
2325
2326         rt = (struct rt6_info *) dst;
2327         if (rt->rt6i_flags & RTF_REJECT) {
2328                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2329                 return;
2330         }
2331
2332         /* Redirect received -> path was valid.
2333          * Redirects are sent only in response to data packets,
2334          * so this nexthop is apparently reachable. --ANK
2335          */
2336         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2337
2338         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2339         if (!neigh)
2340                 return;
2341
2342         /*
2343          *      We have finally decided to accept it.
2344          */
2345
2346         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2347                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2348                      NEIGH_UPDATE_F_OVERRIDE|
2349                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2350                                      NEIGH_UPDATE_F_ISROUTER)),
2351                      NDISC_REDIRECT, &ndopts);
2352
2353         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2354         if (!nrt)
2355                 goto out;
2356
2357         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2358         if (on_link)
2359                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2360
2361         nrt->rt6i_protocol = RTPROT_REDIRECT;
2362         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2363
2364         if (ip6_ins_rt(nrt))
2365                 goto out_release;
2366
2367         netevent.old = &rt->dst;
2368         netevent.new = &nrt->dst;
2369         netevent.daddr = &msg->dest;
2370         netevent.neigh = neigh;
2371         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2372
2373         if (rt->rt6i_flags & RTF_CACHE) {
2374                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2375                 ip6_del_rt(rt);
2376         }
2377
2378 out_release:
2379         /* Release the reference taken in
2380          * ip6_rt_cache_alloc()
2381          */
2382         dst_release(&nrt->dst);
2383
2384 out:
2385         neigh_release(neigh);
2386 }
2387
2388 /*
2389  *      Misc support functions
2390  */
2391
2392 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2393 {
2394         BUG_ON(from->dst.from);
2395
2396         rt->rt6i_flags &= ~RTF_EXPIRES;
2397         dst_hold(&from->dst);
2398         rt->dst.from = &from->dst;
2399         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2400 }
2401
2402 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2403 {
2404         rt->dst.input = ort->dst.input;
2405         rt->dst.output = ort->dst.output;
2406         rt->rt6i_dst = ort->rt6i_dst;
2407         rt->dst.error = ort->dst.error;
2408         rt->rt6i_idev = ort->rt6i_idev;
2409         if (rt->rt6i_idev)
2410                 in6_dev_hold(rt->rt6i_idev);
2411         rt->dst.lastuse = jiffies;
2412         rt->rt6i_gateway = ort->rt6i_gateway;
2413         rt->rt6i_flags = ort->rt6i_flags;
2414         rt6_set_from(rt, ort);
2415         rt->rt6i_metric = ort->rt6i_metric;
2416 #ifdef CONFIG_IPV6_SUBTREES
2417         rt->rt6i_src = ort->rt6i_src;
2418 #endif
2419         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2420         rt->rt6i_table = ort->rt6i_table;
2421         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2422 }
2423
2424 #ifdef CONFIG_IPV6_ROUTE_INFO
2425 static struct rt6_info *rt6_get_route_info(struct net *net,
2426                                            const struct in6_addr *prefix, int prefixlen,
2427                                            const struct in6_addr *gwaddr,
2428                                            struct net_device *dev)
2429 {
2430         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2431         int ifindex = dev->ifindex;
2432         struct fib6_node *fn;
2433         struct rt6_info *rt = NULL;
2434         struct fib6_table *table;
2435
2436         table = fib6_get_table(net, tb_id);
2437         if (!table)
2438                 return NULL;
2439
2440         read_lock_bh(&table->tb6_lock);
2441         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2442         if (!fn)
2443                 goto out;
2444
2445         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2446                 if (rt->dst.dev->ifindex != ifindex)
2447                         continue;
2448                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2449                         continue;
2450                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2451                         continue;
2452                 dst_hold(&rt->dst);
2453                 break;
2454         }
2455 out:
2456         read_unlock_bh(&table->tb6_lock);
2457         return rt;
2458 }
2459
2460 static struct rt6_info *rt6_add_route_info(struct net *net,
2461                                            const struct in6_addr *prefix, int prefixlen,
2462                                            const struct in6_addr *gwaddr,
2463                                            struct net_device *dev,
2464                                            unsigned int pref)
2465 {
2466         struct fib6_config cfg = {
2467                 .fc_metric      = IP6_RT_PRIO_USER,
2468                 .fc_ifindex     = dev->ifindex,
2469                 .fc_dst_len     = prefixlen,
2470                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2471                                   RTF_UP | RTF_PREF(pref),
2472                 .fc_protocol = RTPROT_RA,
2473                 .fc_nlinfo.portid = 0,
2474                 .fc_nlinfo.nlh = NULL,
2475                 .fc_nlinfo.nl_net = net,
2476         };
2477
2478         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2479         cfg.fc_dst = *prefix;
2480         cfg.fc_gateway = *gwaddr;
2481
2482         /* We should treat it as a default route if prefix length is 0. */
2483         if (!prefixlen)
2484                 cfg.fc_flags |= RTF_DEFAULT;
2485
2486         ip6_route_add(&cfg, NULL);
2487
2488         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2489 }
2490 #endif
2491
2492 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2493 {
2494         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2495         struct rt6_info *rt;
2496         struct fib6_table *table;
2497
2498         table = fib6_get_table(dev_net(dev), tb_id);
2499         if (!table)
2500                 return NULL;
2501
2502         read_lock_bh(&table->tb6_lock);
2503         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2504                 if (dev == rt->dst.dev &&
2505                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2506                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2507                         break;
2508         }
2509         if (rt)
2510                 dst_hold(&rt->dst);
2511         read_unlock_bh(&table->tb6_lock);
2512         return rt;
2513 }
2514
2515 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2516                                      struct net_device *dev,
2517                                      unsigned int pref)
2518 {
2519         struct fib6_config cfg = {
2520                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2521                 .fc_metric      = IP6_RT_PRIO_USER,
2522                 .fc_ifindex     = dev->ifindex,
2523                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2524                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2525                 .fc_protocol = RTPROT_RA,
2526                 .fc_nlinfo.portid = 0,
2527                 .fc_nlinfo.nlh = NULL,
2528                 .fc_nlinfo.nl_net = dev_net(dev),
2529         };
2530
2531         cfg.fc_gateway = *gwaddr;
2532
2533         if (!ip6_route_add(&cfg, NULL)) {
2534                 struct fib6_table *table;
2535
2536                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2537                 if (table)
2538                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2539         }
2540
2541         return rt6_get_dflt_router(gwaddr, dev);
2542 }
2543
2544 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2545 {
2546         struct rt6_info *rt;
2547
2548 restart:
2549         read_lock_bh(&table->tb6_lock);
2550         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2551                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2552                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2553                         dst_hold(&rt->dst);
2554                         read_unlock_bh(&table->tb6_lock);
2555                         ip6_del_rt(rt);
2556                         goto restart;
2557                 }
2558         }
2559         read_unlock_bh(&table->tb6_lock);
2560
2561         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2562 }
2563
2564 void rt6_purge_dflt_routers(struct net *net)
2565 {
2566         struct fib6_table *table;
2567         struct hlist_head *head;
2568         unsigned int h;
2569
2570         rcu_read_lock();
2571
2572         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2573                 head = &net->ipv6.fib_table_hash[h];
2574                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2575                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2576                                 __rt6_purge_dflt_routers(table);
2577                 }
2578         }
2579
2580         rcu_read_unlock();
2581 }
2582
2583 static void rtmsg_to_fib6_config(struct net *net,
2584                                  struct in6_rtmsg *rtmsg,
2585                                  struct fib6_config *cfg)
2586 {
2587         memset(cfg, 0, sizeof(*cfg));
2588
2589         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2590                          : RT6_TABLE_MAIN;
2591         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2592         cfg->fc_metric = rtmsg->rtmsg_metric;
2593         cfg->fc_expires = rtmsg->rtmsg_info;
2594         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2595         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2596         cfg->fc_flags = rtmsg->rtmsg_flags;
2597
2598         cfg->fc_nlinfo.nl_net = net;
2599
2600         cfg->fc_dst = rtmsg->rtmsg_dst;
2601         cfg->fc_src = rtmsg->rtmsg_src;
2602         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2603 }
2604
2605 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2606 {
2607         struct fib6_config cfg;
2608         struct in6_rtmsg rtmsg;
2609         int err;
2610
2611         switch (cmd) {
2612         case SIOCADDRT:         /* Add a route */
2613         case SIOCDELRT:         /* Delete a route */
2614                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2615                         return -EPERM;
2616                 err = copy_from_user(&rtmsg, arg,
2617                                      sizeof(struct in6_rtmsg));
2618                 if (err)
2619                         return -EFAULT;
2620
2621                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2622
2623                 rtnl_lock();
2624                 switch (cmd) {
2625                 case SIOCADDRT:
2626                         err = ip6_route_add(&cfg, NULL);
2627                         break;
2628                 case SIOCDELRT:
2629                         err = ip6_route_del(&cfg, NULL);
2630                         break;
2631                 default:
2632                         err = -EINVAL;
2633                 }
2634                 rtnl_unlock();
2635
2636                 return err;
2637         }
2638
2639         return -EINVAL;
2640 }
2641
2642 /*
2643  *      Drop the packet on the floor
2644  */
2645
2646 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2647 {
2648         int type;
2649         struct dst_entry *dst = skb_dst(skb);
2650         switch (ipstats_mib_noroutes) {
2651         case IPSTATS_MIB_INNOROUTES:
2652                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2653                 if (type == IPV6_ADDR_ANY) {
2654                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2655                                       IPSTATS_MIB_INADDRERRORS);
2656                         break;
2657                 }
2658                 /* FALLTHROUGH */
2659         case IPSTATS_MIB_OUTNOROUTES:
2660                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2661                               ipstats_mib_noroutes);
2662                 break;
2663         }
2664         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2665         kfree_skb(skb);
2666         return 0;
2667 }
2668
2669 static int ip6_pkt_discard(struct sk_buff *skb)
2670 {
2671         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2672 }
2673
2674 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2675 {
2676         skb->dev = skb_dst(skb)->dev;
2677         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2678 }
2679
2680 static int ip6_pkt_prohibit(struct sk_buff *skb)
2681 {
2682         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2683 }
2684
2685 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2686 {
2687         skb->dev = skb_dst(skb)->dev;
2688         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2689 }
2690
2691 /*
2692  *      Allocate a dst for a local (unicast / anycast) address.
2693  */
2694
2695 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2696                                     const struct in6_addr *addr,
2697                                     bool anycast)
2698 {
2699         u32 tb_id;
2700         struct net *net = dev_net(idev->dev);
2701         struct net_device *dev = net->loopback_dev;
2702         struct rt6_info *rt;
2703
2704         /* use the L3 master device as loopback for host routes if the
2705          * device is enslaved and the address is not link-local or multicast
2706          */
2707         if (!rt6_need_strict(addr))
2708                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2709
2710         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2711         if (!rt)
2712                 return ERR_PTR(-ENOMEM);
2713
2714         in6_dev_hold(idev);
2715
2716         rt->dst.flags |= DST_HOST;
2717         rt->dst.input = ip6_input;
2718         rt->dst.output = ip6_output;
2719         rt->rt6i_idev = idev;
2720
2721         rt->rt6i_protocol = RTPROT_KERNEL;
2722         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2723         if (anycast)
2724                 rt->rt6i_flags |= RTF_ANYCAST;
2725         else
2726                 rt->rt6i_flags |= RTF_LOCAL;
2727
2728         rt->rt6i_gateway  = *addr;
2729         rt->rt6i_dst.addr = *addr;
2730         rt->rt6i_dst.plen = 128;
2731         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2732         rt->rt6i_table = fib6_get_table(net, tb_id);
2733
2734         return rt;
2735 }
2736
2737 /* remove the deleted IP address from prefsrc entries */
2738 struct arg_dev_net_ip {
2739         struct net_device *dev;
2740         struct net *net;
2741         struct in6_addr *addr;
2742 };
2743
2744 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2745 {
2746         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2747         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2748         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2749
2750         if (((void *)rt->dst.dev == dev || !dev) &&
2751             rt != net->ipv6.ip6_null_entry &&
2752             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2753                 /* remove prefsrc entry */
2754                 rt->rt6i_prefsrc.plen = 0;
2755         }
2756         return 0;
2757 }
2758
2759 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2760 {
2761         struct net *net = dev_net(ifp->idev->dev);
2762         struct arg_dev_net_ip adni = {
2763                 .dev = ifp->idev->dev,
2764                 .net = net,
2765                 .addr = &ifp->addr,
2766         };
2767         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2768 }
2769
2770 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2771 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2772
2773 /* Remove routers and update dst entries when a gateway turns into a host. */
2774 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2775 {
2776         struct in6_addr *gateway = (struct in6_addr *)arg;
2777
2778         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2779              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2780              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2781                 return -1;
2782         }
2783         return 0;
2784 }
2785
2786 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2787 {
2788         fib6_clean_all(net, fib6_clean_tohost, gateway);
2789 }
2790
2791 struct arg_dev_net {
2792         struct net_device *dev;
2793         struct net *net;
2794 };
2795
2796 /* called with the write lock held for the table containing rt */
2797 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2798 {
2799         const struct arg_dev_net *adn = arg;
2800         const struct net_device *dev = adn->dev;
2801
2802         if ((rt->dst.dev == dev || !dev) &&
2803             rt != adn->net->ipv6.ip6_null_entry &&
2804             (rt->rt6i_nsiblings == 0 ||
2805              (dev && netdev_unregistering(dev)) ||
2806              !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2807                 return -1;
2808
2809         return 0;
2810 }
2811
2812 void rt6_ifdown(struct net *net, struct net_device *dev)
2813 {
2814         struct arg_dev_net adn = {
2815                 .dev = dev,
2816                 .net = net,
2817         };
2818
2819         fib6_clean_all(net, fib6_ifdown, &adn);
2820         if (dev)
2821                 rt6_uncached_list_flush_dev(net, dev);
2822 }
2823
2824 struct rt6_mtu_change_arg {
2825         struct net_device *dev;
2826         unsigned int mtu;
2827 };
2828
2829 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2830 {
2831         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2832         struct inet6_dev *idev;
2833
2834         /* In IPv6, PMTU discovery is not optional,
2835            so an RTAX_MTU lock cannot disable it.
2836            We still use this lock to block changes
2837            caused by addrconf/ndisc.
2838         */
2839
2840         idev = __in6_dev_get(arg->dev);
2841         if (!idev)
2842                 return 0;
2843
2844         /* For an administrative MTU increase there is no way to discover
2845            an IPv6 PMTU increase, so the PMTU must be updated here.
2846            Since RFC 1981 doesn't cover administrative MTU increases,
2847            updating the PMTU on an increase is a MUST (e.g. jumbo frames).
2848          */
2849         /*
2850            If the new MTU is less than the route PMTU, the new MTU will be
2851            the lowest MTU in the path; update the route PMTU to reflect the
2852            decrease.  If the new MTU is greater than the route PMTU, and the
2853            old MTU was the lowest MTU in the path, update the route PMTU to
2854            reflect the increase.  In that case, if another node's MTU is now
2855            the lowest in the path, a Packet Too Big message will trigger
2856            PMTU discovery again.
2857          */
2858         if (rt->dst.dev == arg->dev &&
2859             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2860             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2861                 if (rt->rt6i_flags & RTF_CACHE) {
2862                         /* For RTF_CACHE with rt6i_pmtu == 0
2863                          * (i.e. a redirected route),
2864                          * the metrics of its rt->dst.from have already
2865                          * been updated.
2866                          */
2867                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2868                                 rt->rt6i_pmtu = arg->mtu;
2869                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2870                            (dst_mtu(&rt->dst) < arg->mtu &&
2871                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2872                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2873                 }
2874         }
2875         return 0;
2876 }
2877
2878 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2879 {
2880         struct rt6_mtu_change_arg arg = {
2881                 .dev = dev,
2882                 .mtu = mtu,
2883         };
2884
2885         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2886 }
2887
2888 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2889         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2890         [RTA_OIF]               = { .type = NLA_U32 },
2891         [RTA_IIF]               = { .type = NLA_U32 },
2892         [RTA_PRIORITY]          = { .type = NLA_U32 },
2893         [RTA_METRICS]           = { .type = NLA_NESTED },
2894         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2895         [RTA_PREF]              = { .type = NLA_U8 },
2896         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2897         [RTA_ENCAP]             = { .type = NLA_NESTED },
2898         [RTA_EXPIRES]           = { .type = NLA_U32 },
2899         [RTA_UID]               = { .type = NLA_U32 },
2900         [RTA_MARK]              = { .type = NLA_U32 },
2901 };
2902
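/*
 * Parse an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config:
 * validate the attributes against rtm_ipv6_policy and translate
 * gateway, prefixes, metrics, multipath, encap, preference and expiry
 * into the corresponding fc_* fields.
 */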
2903 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2904                               struct fib6_config *cfg,
2905                               struct netlink_ext_ack *extack)
2906 {
2907         struct rtmsg *rtm;
2908         struct nlattr *tb[RTA_MAX+1];
2909         unsigned int pref;
2910         int err;
2911
2912         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2913                           NULL);
2914         if (err < 0)
2915                 goto errout;
2916
2917         err = -EINVAL;
2918         rtm = nlmsg_data(nlh);
2919         memset(cfg, 0, sizeof(*cfg));
2920
2921         cfg->fc_table = rtm->rtm_table;
2922         cfg->fc_dst_len = rtm->rtm_dst_len;
2923         cfg->fc_src_len = rtm->rtm_src_len;
2924         cfg->fc_flags = RTF_UP;
2925         cfg->fc_protocol = rtm->rtm_protocol;
2926         cfg->fc_type = rtm->rtm_type;
2927
2928         if (rtm->rtm_type == RTN_UNREACHABLE ||
2929             rtm->rtm_type == RTN_BLACKHOLE ||
2930             rtm->rtm_type == RTN_PROHIBIT ||
2931             rtm->rtm_type == RTN_THROW)
2932                 cfg->fc_flags |= RTF_REJECT;
2933
2934         if (rtm->rtm_type == RTN_LOCAL)
2935                 cfg->fc_flags |= RTF_LOCAL;
2936
2937         if (rtm->rtm_flags & RTM_F_CLONED)
2938                 cfg->fc_flags |= RTF_CACHE;
2939
2940         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2941         cfg->fc_nlinfo.nlh = nlh;
2942         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2943
2944         if (tb[RTA_GATEWAY]) {
2945                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2946                 cfg->fc_flags |= RTF_GATEWAY;
2947         }
2948
2949         if (tb[RTA_DST]) {
2950                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2951
2952                 if (nla_len(tb[RTA_DST]) < plen)
2953                         goto errout;
2954
2955                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2956         }
2957
2958         if (tb[RTA_SRC]) {
2959                 int plen = (rtm->rtm_src_len + 7) >> 3;
2960
2961                 if (nla_len(tb[RTA_SRC]) < plen)
2962                         goto errout;
2963
2964                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2965         }
2966
2967         if (tb[RTA_PREFSRC])
2968                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2969
2970         if (tb[RTA_OIF])
2971                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2972
2973         if (tb[RTA_PRIORITY])
2974                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2975
2976         if (tb[RTA_METRICS]) {
2977                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2978                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2979         }
2980
2981         if (tb[RTA_TABLE])
2982                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2983
2984         if (tb[RTA_MULTIPATH]) {
2985                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2986                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2987
2988                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2989                                                      cfg->fc_mp_len, extack);
2990                 if (err < 0)
2991                         goto errout;
2992         }
2993
2994         if (tb[RTA_PREF]) {
2995                 pref = nla_get_u8(tb[RTA_PREF]);
2996                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2997                     pref != ICMPV6_ROUTER_PREF_HIGH)
2998                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2999                 cfg->fc_flags |= RTF_PREF(pref);
3000         }
3001
3002         if (tb[RTA_ENCAP])
3003                 cfg->fc_encap = tb[RTA_ENCAP];
3004
3005         if (tb[RTA_ENCAP_TYPE]) {
3006                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3007
3008                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3009                 if (err < 0)
3010                         goto errout;
3011         }
3012
3013         if (tb[RTA_EXPIRES]) {
3014                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3015
3016                 if (addrconf_finite_timeout(timeout)) {
3017                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3018                         cfg->fc_flags |= RTF_EXPIRES;
3019                 }
3020         }
3021
3022         err = 0;
3023 errout:
3024         return err;
3025 }
3026
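/*
 * Illustrative userspace sketch, not part of this file: one way to build the
 * RTM_NEWROUTE request that rtm_to_fib6_config() above decodes.  The
 * attribute types (RTA_DST, RTA_GATEWAY, RTA_OIF, RTA_PRIORITY) correspond
 * directly to the tb[] checks in the parser; the struct layout, helper names
 * and flag choices below are assumptions about one possible caller, not
 * kernel or libnetlink API.
 */
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

struct rt_req {
	struct nlmsghdr nlh;
	struct rtmsg rtm;
	char buf[256];		/* attribute space; assumed large enough, no bounds check */
};

static void add_attr(struct nlmsghdr *nlh, unsigned short type,
		     const void *data, int len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nlh +
					       NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

static int send_route_add(int fd, const struct in6_addr *dst, int plen,
			  const struct in6_addr *gw, int oif, uint32_t metric)
{
	struct rt_req req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_NEWROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = plen;		/* -> cfg->fc_dst_len */
	req.rtm.rtm_table = RT_TABLE_MAIN;	/* -> cfg->fc_table */
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
	req.rtm.rtm_type = RTN_UNICAST;

	add_attr(&req.nlh, RTA_DST, dst, sizeof(*dst));		/* -> cfg->fc_dst */
	add_attr(&req.nlh, RTA_GATEWAY, gw, sizeof(*gw));	/* -> cfg->fc_gateway + RTF_GATEWAY */
	add_attr(&req.nlh, RTA_OIF, &oif, sizeof(oif));		/* -> cfg->fc_ifindex */
	add_attr(&req.nlh, RTA_PRIORITY, &metric, sizeof(metric)); /* -> cfg->fc_metric */

	return send(fd, &req, req.nlh.nlmsg_len, 0);
}
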
3027 struct rt6_nh {
3028         struct rt6_info *rt6_info;
3029         struct fib6_config r_cfg;
3030         struct mx6_config mxc;
3031         struct list_head next;
3032 };
3033
3034 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3035 {
3036         struct rt6_nh *nh;
3037
3038         list_for_each_entry(nh, rt6_nh_list, next) {
3039                 pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3040                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3041                         nh->r_cfg.fc_ifindex);
3042         }
3043 }
3044
3045 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3046                                  struct rt6_info *rt, struct fib6_config *r_cfg)
3047 {
3048         struct rt6_nh *nh;
3049         int err = -EEXIST;
3050
3051         list_for_each_entry(nh, rt6_nh_list, next) {
3052                 /* check if rt6_info already exists */
3053                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3054                         return err;
3055         }
3056
3057         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3058         if (!nh)
3059                 return -ENOMEM;
3060         nh->rt6_info = rt;
3061         err = ip6_convert_metrics(&nh->mxc, r_cfg);
3062         if (err) {
3063                 kfree(nh);
3064                 return err;
3065         }
3066         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3067         list_add_tail(&nh->next, rt6_nh_list);
3068
3069         return 0;
3070 }
3071
3072 static void ip6_route_mpath_notify(struct rt6_info *rt,
3073                                    struct rt6_info *rt_last,
3074                                    struct nl_info *info,
3075                                    __u16 nlflags)
3076 {
3077         /* if this is an APPEND route, then rt points to the first route
3078          * inserted and rt_last points to last route inserted. Userspace
3079          * wants a consistent dump of the route which starts at the first
3080          * nexthop. Since sibling routes are always added at the end of
3081          * the list, find the first sibling of the last route appended
3082          */
3083         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3084                 rt = list_first_entry(&rt_last->rt6i_siblings,
3085                                       struct rt6_info,
3086                                       rt6i_siblings);
3087         }
3088
3089         if (rt)
3090                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3091 }
3092
3093 static int ip6_route_multipath_add(struct fib6_config *cfg,
3094                                    struct netlink_ext_ack *extack)
3095 {
3096         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3097         struct nl_info *info = &cfg->fc_nlinfo;
3098         struct fib6_config r_cfg;
3099         struct rtnexthop *rtnh;
3100         struct rt6_info *rt;
3101         struct rt6_nh *err_nh;
3102         struct rt6_nh *nh, *nh_safe;
3103         __u16 nlflags;
3104         int remaining;
3105         int attrlen;
3106         int err = 1;
3107         int nhn = 0;
3108         int replace = (cfg->fc_nlinfo.nlh &&
3109                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3110         LIST_HEAD(rt6_nh_list);
3111
3112         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3113         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3114                 nlflags |= NLM_F_APPEND;
3115
3116         remaining = cfg->fc_mp_len;
3117         rtnh = (struct rtnexthop *)cfg->fc_mp;
3118
3119         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3120          * rt6_info structs per nexthop
3121          */
3122         while (rtnh_ok(rtnh, remaining)) {
3123                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3124                 if (rtnh->rtnh_ifindex)
3125                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3126
3127                 attrlen = rtnh_attrlen(rtnh);
3128                 if (attrlen > 0) {
3129                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3130
3131                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3132                         if (nla) {
3133                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3134                                 r_cfg.fc_flags |= RTF_GATEWAY;
3135                         }
3136                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3137                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3138                         if (nla)
3139                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3140                 }
3141
3142                 rt = ip6_route_info_create(&r_cfg, extack);
3143                 if (IS_ERR(rt)) {
3144                         err = PTR_ERR(rt);
3145                         rt = NULL;
3146                         goto cleanup;
3147                 }
3148
3149                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3150                 if (err) {
3151                         dst_release_immediate(&rt->dst);
3152                         goto cleanup;
3153                 }
3154
3155                 rtnh = rtnh_next(rtnh, &remaining);
3156         }
3157
3158         /* for add and replace send one notification with all nexthops.
3159          * Skip the notification in fib6_add_rt2node and send one with
3160          * the full route when done
3161          */
3162         info->skip_notify = 1;
3163
3164         err_nh = NULL;
3165         list_for_each_entry(nh, &rt6_nh_list, next) {
3166                 rt_last = nh->rt6_info;
3167                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3168                 /* save reference to first route for notification */
3169                 if (!rt_notif && !err)
3170                         rt_notif = nh->rt6_info;
3171
3172                 /* nh->rt6_info is used or freed at this point, reset to NULL */
3173                 nh->rt6_info = NULL;
3174                 if (err) {
3175                         if (replace && nhn)
3176                                 ip6_print_replace_route_err(&rt6_nh_list);
3177                         err_nh = nh;
3178                         goto add_errout;
3179                 }
3180
3181                 /* Because each route is added like a single route, we remove
3182                  * these flags after the first nexthop: if there is a collision,
3183                  * we have already failed to add the first nexthop, since
3184                  * fib6_add_rt2node() has rejected it; when replacing, the old
3185                  * nexthops have been replaced by the first new one, and the
3186                  * remaining nexthops should be appended to it.
3187                  */
3188                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3189                                                      NLM_F_REPLACE);
3190                 nhn++;
3191         }
3192
3193         /* success ... tell user about new route */
3194         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3195         goto cleanup;
3196
3197 add_errout:
3198         /* send notification for routes that were added so that
3199          * the delete notifications sent by ip6_route_del are
3200          * coherent
3201          */
3202         if (rt_notif)
3203                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3204
3205         /* Delete routes that were already added */
3206         list_for_each_entry(nh, &rt6_nh_list, next) {
3207                 if (err_nh == nh)
3208                         break;
3209                 ip6_route_del(&nh->r_cfg, extack);
3210         }
3211
3212 cleanup:
3213         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3214                 if (nh->rt6_info)
3215                         dst_release_immediate(&nh->rt6_info->dst);
3216                 kfree(nh->mxc.mx);
3217                 list_del(&nh->next);
3218                 kfree(nh);
3219         }
3220
3221         return err;
3222 }
3223
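/*
 * Illustrative userspace sketch, not part of this file: packing one entry of
 * the RTA_MULTIPATH payload that the rtnh_ok()/rtnh_next() loop in
 * ip6_route_multipath_add() walks.  Each nexthop is a struct rtnexthop
 * followed by its own nested attributes (here just RTA_GATEWAY).  The helper
 * name and calling convention are assumptions, not kernel or libnetlink API.
 */
#include <string.h>
#include <netinet/in.h>
#include <linux/rtnetlink.h>

/* Append one nexthop (gateway + output ifindex) at offset 'off' in 'buf';
 * returns the new offset.  The caller is assumed to size 'buf' generously.
 */
static int pack_nexthop(char *buf, int off, const struct in6_addr *gw, int oif)
{
	struct rtnexthop *rtnh = (struct rtnexthop *)(buf + off);
	struct rtattr *rta = (struct rtattr *)RTNH_DATA(rtnh);

	rtnh->rtnh_flags = 0;
	rtnh->rtnh_hops = 0;
	rtnh->rtnh_ifindex = oif;			/* -> r_cfg.fc_ifindex */

	rta->rta_type = RTA_GATEWAY;			/* -> r_cfg.fc_gateway */
	rta->rta_len = RTA_LENGTH(sizeof(*gw));
	memcpy(RTA_DATA(rta), gw, sizeof(*gw));

	rtnh->rtnh_len = RTNH_LENGTH(RTA_ALIGN(rta->rta_len));
	return off + RTNH_ALIGN(rtnh->rtnh_len);
}

/* Calling pack_nexthop() repeatedly yields a buffer whose address and total
 * length become cfg->fc_mp and cfg->fc_mp_len once it is sent as the payload
 * of an RTA_MULTIPATH attribute.
 */
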
3224 static int ip6_route_multipath_del(struct fib6_config *cfg,
3225                                    struct netlink_ext_ack *extack)
3226 {
3227         struct fib6_config r_cfg;
3228         struct rtnexthop *rtnh;
3229         int remaining;
3230         int attrlen;
3231         int err = 1, last_err = 0;
3232
3233         remaining = cfg->fc_mp_len;
3234         rtnh = (struct rtnexthop *)cfg->fc_mp;
3235
3236         /* Parse a Multipath Entry */
3237         while (rtnh_ok(rtnh, remaining)) {
3238                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3239                 if (rtnh->rtnh_ifindex)
3240                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3241
3242                 attrlen = rtnh_attrlen(rtnh);
3243                 if (attrlen > 0) {
3244                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3245
3246                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3247                         if (nla) {
3248                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3249                                 r_cfg.fc_flags |= RTF_GATEWAY;
3250                         }
3251                 }
3252                 err = ip6_route_del(&r_cfg, extack);
3253                 if (err)
3254                         last_err = err;
3255
3256                 rtnh = rtnh_next(rtnh, &remaining);
3257         }
3258
3259         return last_err;
3260 }
3261
3262 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3263                               struct netlink_ext_ack *extack)
3264 {
3265         struct fib6_config cfg;
3266         int err;
3267
3268         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3269         if (err < 0)
3270                 return err;
3271
3272         if (cfg.fc_mp)
3273                 return ip6_route_multipath_del(&cfg, extack);
3274         else {
3275                 cfg.fc_delete_all_nh = 1;
3276                 return ip6_route_del(&cfg, extack);
3277         }
3278 }
3279
3280 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3281                               struct netlink_ext_ack *extack)
3282 {
3283         struct fib6_config cfg;
3284         int err;
3285
3286         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3287         if (err < 0)
3288                 return err;
3289
3290         if (cfg.fc_mp)
3291                 return ip6_route_multipath_add(&cfg, extack);
3292         else
3293                 return ip6_route_add(&cfg, extack);
3294 }
3295
3296 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3297 {
3298         int nexthop_len = 0;
3299
3300         if (rt->rt6i_nsiblings) {
3301                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3302                             + NLA_ALIGN(sizeof(struct rtnexthop))
3303                             + nla_total_size(16) /* RTA_GATEWAY */
3304                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
3305
3306                 nexthop_len *= rt->rt6i_nsiblings;
3307         }
3308
3309         return NLMSG_ALIGN(sizeof(struct rtmsg))
3310                + nla_total_size(16) /* RTA_SRC */
3311                + nla_total_size(16) /* RTA_DST */
3312                + nla_total_size(16) /* RTA_GATEWAY */
3313                + nla_total_size(16) /* RTA_PREFSRC */
3314                + nla_total_size(4) /* RTA_TABLE */
3315                + nla_total_size(4) /* RTA_IIF */
3316                + nla_total_size(4) /* RTA_OIF */
3317                + nla_total_size(4) /* RTA_PRIORITY */
3318                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3319                + nla_total_size(sizeof(struct rta_cacheinfo))
3320                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3321                + nla_total_size(1) /* RTA_PREF */
3322                + lwtunnel_get_encap_size(rt->dst.lwtstate)
3323                + nexthop_len;
3324 }
3325
3326 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3327                             unsigned int *flags, bool skip_oif)
3328 {
3329         if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3330                 *flags |= RTNH_F_LINKDOWN;
3331                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3332                         *flags |= RTNH_F_DEAD;
3333         }
3334
3335         if (rt->rt6i_flags & RTF_GATEWAY) {
3336                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3337                         goto nla_put_failure;
3338         }
3339
3340         /* not needed for multipath encoding because it has an rtnexthop struct */
3341         if (!skip_oif && rt->dst.dev &&
3342             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3343                 goto nla_put_failure;
3344
3345         if (rt->dst.lwtstate &&
3346             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3347                 goto nla_put_failure;
3348
3349         return 0;
3350
3351 nla_put_failure:
3352         return -EMSGSIZE;
3353 }
3354
3355 /* add multipath next hop */
3356 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3357 {
3358         struct rtnexthop *rtnh;
3359         unsigned int flags = 0;
3360
3361         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3362         if (!rtnh)
3363                 goto nla_put_failure;
3364
3365         rtnh->rtnh_hops = 0;
3366         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3367
3368         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3369                 goto nla_put_failure;
3370
3371         rtnh->rtnh_flags = flags;
3372
3373         /* length of rtnetlink header + attributes */
3374         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3375
3376         return 0;
3377
3378 nla_put_failure:
3379         return -EMSGSIZE;
3380 }
3381
3382 static int rt6_fill_node(struct net *net,
3383                          struct sk_buff *skb, struct rt6_info *rt,
3384                          struct in6_addr *dst, struct in6_addr *src,
3385                          int iif, int type, u32 portid, u32 seq,
3386                          unsigned int flags)
3387 {
3388         u32 metrics[RTAX_MAX];
3389         struct rtmsg *rtm;
3390         struct nlmsghdr *nlh;
3391         long expires;
3392         u32 table;
3393
3394         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3395         if (!nlh)
3396                 return -EMSGSIZE;
3397
3398         rtm = nlmsg_data(nlh);
3399         rtm->rtm_family = AF_INET6;
3400         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3401         rtm->rtm_src_len = rt->rt6i_src.plen;
3402         rtm->rtm_tos = 0;
3403         if (rt->rt6i_table)
3404                 table = rt->rt6i_table->tb6_id;
3405         else
3406                 table = RT6_TABLE_UNSPEC;
3407         rtm->rtm_table = table;
3408         if (nla_put_u32(skb, RTA_TABLE, table))
3409                 goto nla_put_failure;
3410         if (rt->rt6i_flags & RTF_REJECT) {
3411                 switch (rt->dst.error) {
3412                 case -EINVAL:
3413                         rtm->rtm_type = RTN_BLACKHOLE;
3414                         break;
3415                 case -EACCES:
3416                         rtm->rtm_type = RTN_PROHIBIT;
3417                         break;
3418                 case -EAGAIN:
3419                         rtm->rtm_type = RTN_THROW;
3420                         break;
3421                 default:
3422                         rtm->rtm_type = RTN_UNREACHABLE;
3423                         break;
3424                 }
3425         }
3426         else if (rt->rt6i_flags & RTF_LOCAL)
3427                 rtm->rtm_type = RTN_LOCAL;
3428         else if (rt->rt6i_flags & RTF_ANYCAST)
3429                 rtm->rtm_type = RTN_ANYCAST;
3430         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3431                 rtm->rtm_type = RTN_LOCAL;
3432         else
3433                 rtm->rtm_type = RTN_UNICAST;
3434         rtm->rtm_flags = 0;
3435         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3436         rtm->rtm_protocol = rt->rt6i_protocol;
3437
3438         if (rt->rt6i_flags & RTF_CACHE)
3439                 rtm->rtm_flags |= RTM_F_CLONED;
3440
3441         if (dst) {
3442                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3443                         goto nla_put_failure;
3444                 rtm->rtm_dst_len = 128;
3445         } else if (rtm->rtm_dst_len)
3446                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3447                         goto nla_put_failure;
3448 #ifdef CONFIG_IPV6_SUBTREES
3449         if (src) {
3450                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3451                         goto nla_put_failure;
3452                 rtm->rtm_src_len = 128;
3453         } else if (rtm->rtm_src_len &&
3454                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3455                 goto nla_put_failure;
3456 #endif
3457         if (iif) {
3458 #ifdef CONFIG_IPV6_MROUTE
3459                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3460                         int err = ip6mr_get_route(net, skb, rtm, portid);
3461
3462                         if (err == 0)
3463                                 return 0;
3464                         if (err < 0)
3465                                 goto nla_put_failure;
3466                 } else
3467 #endif
3468                         if (nla_put_u32(skb, RTA_IIF, iif))
3469                                 goto nla_put_failure;
3470         } else if (dst) {
3471                 struct in6_addr saddr_buf;
3472                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3473                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3474                         goto nla_put_failure;
3475         }
3476
3477         if (rt->rt6i_prefsrc.plen) {
3478                 struct in6_addr saddr_buf;
3479                 saddr_buf = rt->rt6i_prefsrc.addr;
3480                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3481                         goto nla_put_failure;
3482         }
3483
3484         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3485         if (rt->rt6i_pmtu)
3486                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3487         if (rtnetlink_put_metrics(skb, metrics) < 0)
3488                 goto nla_put_failure;
3489
3490         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3491                 goto nla_put_failure;
3492
3493         /* For multipath routes, walk the siblings list and add
3494          * each as a nexthop within RTA_MULTIPATH.
3495          */
3496         if (rt->rt6i_nsiblings) {
3497                 struct rt6_info *sibling, *next_sibling;
3498                 struct nlattr *mp;
3499
3500                 mp = nla_nest_start(skb, RTA_MULTIPATH);
3501                 if (!mp)
3502                         goto nla_put_failure;
3503
3504                 if (rt6_add_nexthop(skb, rt) < 0)
3505                         goto nla_put_failure;
3506
3507                 list_for_each_entry_safe(sibling, next_sibling,
3508                                          &rt->rt6i_siblings, rt6i_siblings) {
3509                         if (rt6_add_nexthop(skb, sibling) < 0)
3510                                 goto nla_put_failure;
3511                 }
3512
3513                 nla_nest_end(skb, mp);
3514         } else {
3515                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3516                         goto nla_put_failure;
3517         }
3518
3519         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3520
3521         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3522                 goto nla_put_failure;
3523
3524         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3525                 goto nla_put_failure;
3526
3527
3528         nlmsg_end(skb, nlh);
3529         return 0;
3530
3531 nla_put_failure:
3532         nlmsg_cancel(skb, nlh);
3533         return -EMSGSIZE;
3534 }
3535
3536 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3537 {
3538         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3539         struct net *net = arg->net;
3540
3541         if (rt == net->ipv6.ip6_null_entry)
3542                 return 0;
3543
3544         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3545                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3546
3547                 /* user wants prefix routes only */
3548                 if (rtm->rtm_flags & RTM_F_PREFIX &&
3549                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3550                         /* success since this is not a prefix route */
3551                         return 1;
3552                 }
3553         }
3554
3555         return rt6_fill_node(net,
3556                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3557                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3558                      NLM_F_MULTI);
3559 }
3560
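/*
 * Illustrative userspace sketch, not part of this file: the dump request
 * whose per-route callbacks end up in rt6_dump_route().  Setting RTM_F_PREFIX
 * in rtm_flags makes the callback above skip routes without RTF_PREFIX_RT.
 * The struct layout and function name are assumptions about one possible
 * caller.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int request_route_dump(int fd, int prefix_only)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.rtm.rtm_family = AF_INET6;
	if (prefix_only)
		req.rtm.rtm_flags |= RTM_F_PREFIX;	/* checked in rt6_dump_route() */

	return send(fd, &req, req.nlh.nlmsg_len, 0);
}
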
3561 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3562                               struct netlink_ext_ack *extack)
3563 {
3564         struct net *net = sock_net(in_skb->sk);
3565         struct nlattr *tb[RTA_MAX+1];
3566         int err, iif = 0, oif = 0;
3567         struct dst_entry *dst;
3568         struct rt6_info *rt;
3569         struct sk_buff *skb;
3570         struct rtmsg *rtm;
3571         struct flowi6 fl6;
3572         bool fibmatch;
3573
3574         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3575                           extack);
3576         if (err < 0)
3577                 goto errout;
3578
3579         err = -EINVAL;
3580         memset(&fl6, 0, sizeof(fl6));
3581         rtm = nlmsg_data(nlh);
3582         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3583         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3584
3585         if (tb[RTA_SRC]) {
3586                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3587                         goto errout;
3588
3589                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3590         }
3591
3592         if (tb[RTA_DST]) {
3593                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3594                         goto errout;
3595
3596                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3597         }
3598
3599         if (tb[RTA_IIF])
3600                 iif = nla_get_u32(tb[RTA_IIF]);
3601
3602         if (tb[RTA_OIF])
3603                 oif = nla_get_u32(tb[RTA_OIF]);
3604
3605         if (tb[RTA_MARK])
3606                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3607
3608         if (tb[RTA_UID])
3609                 fl6.flowi6_uid = make_kuid(current_user_ns(),
3610                                            nla_get_u32(tb[RTA_UID]));
3611         else
3612                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3613
3614         if (iif) {
3615                 struct net_device *dev;
3616                 int flags = 0;
3617
3618                 dev = __dev_get_by_index(net, iif);
3619                 if (!dev) {
3620                         err = -ENODEV;
3621                         goto errout;
3622                 }
3623
3624                 fl6.flowi6_iif = iif;
3625
3626                 if (!ipv6_addr_any(&fl6.saddr))
3627                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3628
3629                 if (!fibmatch)
3630                         dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3631         } else {
3632                 fl6.flowi6_oif = oif;
3633
3634                 if (!fibmatch)
3635                         dst = ip6_route_output(net, NULL, &fl6);
3636         }
3637
3638         if (fibmatch)
3639                 dst = ip6_route_lookup(net, &fl6, 0);
3640
3641         rt = container_of(dst, struct rt6_info, dst);
3642         if (rt->dst.error) {
3643                 err = rt->dst.error;
3644                 ip6_rt_put(rt);
3645                 goto errout;
3646         }
3647
3648         if (rt == net->ipv6.ip6_null_entry) {
3649                 err = rt->dst.error;
3650                 ip6_rt_put(rt);
3651                 goto errout;
3652         }
3653
3654         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3655         if (!skb) {
3656                 ip6_rt_put(rt);
3657                 err = -ENOBUFS;
3658                 goto errout;
3659         }
3660
3661         skb_dst_set(skb, &rt->dst);
3662         if (fibmatch)
3663                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3664                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3665                                     nlh->nlmsg_seq, 0);
3666         else
3667                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3668                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3669                                     nlh->nlmsg_seq, 0);
3670         if (err < 0) {
3671                 kfree_skb(skb);
3672                 goto errout;
3673         }
3674
3675         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3676 errout:
3677         return err;
3678 }
3679
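/*
 * Illustrative userspace sketch, not part of this file: the single-route
 * RTM_GETROUTE query handled by inet6_rtm_getroute().  RTA_DST feeds
 * fl6.daddr; setting RTM_F_FIB_MATCH asks for the matching FIB entry rather
 * than the resolved route, i.e. the fibmatch branch above.  Struct layout and
 * function name are assumptions; RTM_F_FIB_MATCH needs matching uapi headers.
 */
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int query_route(int fd, const struct in6_addr *dst, int fibmatch)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		struct rtattr rta;
		struct in6_addr addr;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 128;
	if (fibmatch)
		req.rtm.rtm_flags |= RTM_F_FIB_MATCH;

	req.rta.rta_type = RTA_DST;			/* -> fl6.daddr */
	req.rta.rta_len = RTA_LENGTH(sizeof(*dst));
	req.addr = *dst;

	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)) +
			    RTA_ALIGN(req.rta.rta_len);

	return send(fd, &req, req.nlh.nlmsg_len, 0);
}
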
3680 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3681                      unsigned int nlm_flags)
3682 {
3683         struct sk_buff *skb;
3684         struct net *net = info->nl_net;
3685         u32 seq;
3686         int err;
3687
3688         err = -ENOBUFS;
3689         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3690
3691         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3692         if (!skb)
3693                 goto errout;
3694
3695         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3696                                 event, info->portid, seq, nlm_flags);
3697         if (err < 0) {
3698                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3699                 WARN_ON(err == -EMSGSIZE);
3700                 kfree_skb(skb);
3701                 goto errout;
3702         }
3703         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3704                     info->nlh, gfp_any());
3705         return;
3706 errout:
3707         if (err < 0)
3708                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3709 }
3710
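/*
 * Illustrative userspace sketch, not part of this file: a listener for the
 * RTNLGRP_IPV6_ROUTE notifications that inet6_rt_notify() sends, i.e. the
 * RTM_NEWROUTE/RTM_DELROUTE messages built by rt6_fill_node().  Error paths
 * are trimmed; RTMGRP_IPV6_ROUTE is the legacy bind-time bitmask for that
 * group.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int open_route_monitor(void)
{
	struct sockaddr_nl sa;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	sa.nl_groups = RTMGRP_IPV6_ROUTE;	/* IPv6 route add/del/change events */
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		close(fd);
		return -1;
	}
	return fd;		/* recv() on fd now yields rt6_fill_node() output */
}
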
3711 static int ip6_route_dev_notify(struct notifier_block *this,
3712                                 unsigned long event, void *ptr)
3713 {
3714         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3715         struct net *net = dev_net(dev);
3716
3717         if (!(dev->flags & IFF_LOOPBACK))
3718                 return NOTIFY_OK;
3719
3720         if (event == NETDEV_REGISTER) {
3721                 net->ipv6.ip6_null_entry->dst.dev = dev;
3722                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3723 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3724                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3725                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3726                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3727                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3728 #endif
3729          } else if (event == NETDEV_UNREGISTER &&
3730                     dev->reg_state != NETREG_UNREGISTERED) {
3731                 /* NETDEV_UNREGISTER can be fired multiple times by
3732                  * netdev_wait_allrefs(). Make sure we only call this once.
3733                  */
3734                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
3735 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3736                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
3737                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3738 #endif
3739         }
3740
3741         return NOTIFY_OK;
3742 }
3743
3744 /*
3745  *      /proc
3746  */
3747
3748 #ifdef CONFIG_PROC_FS
3749
3750 static const struct file_operations ipv6_route_proc_fops = {
3751         .owner          = THIS_MODULE,
3752         .open           = ipv6_route_open,
3753         .read           = seq_read,
3754         .llseek         = seq_lseek,
3755         .release        = seq_release_net,
3756 };
3757
3758 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3759 {
3760         struct net *net = (struct net *)seq->private;
3761         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3762                    net->ipv6.rt6_stats->fib_nodes,
3763                    net->ipv6.rt6_stats->fib_route_nodes,
3764                    net->ipv6.rt6_stats->fib_rt_alloc,
3765                    net->ipv6.rt6_stats->fib_rt_entries,
3766                    net->ipv6.rt6_stats->fib_rt_cache,
3767                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3768                    net->ipv6.rt6_stats->fib_discarded_routes);
3769
3770         return 0;
3771 }
3772
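/*
 * Illustrative userspace sketch, not part of this file: consuming the seven
 * hex fields that rt6_stats_seq_show() prints to /proc/net/rt6_stats, in the
 * same order as the seq_printf() above.  Variable names are assumptions.
 */
#include <stdio.h>

static int read_rt6_stats(void)
{
	unsigned int fib_nodes, route_nodes, rt_alloc, rt_entries;
	unsigned int rt_cache, dst_entries, discarded;
	FILE *f = fopen("/proc/net/rt6_stats", "r");

	if (!f)
		return -1;
	if (fscanf(f, "%x %x %x %x %x %x %x", &fib_nodes, &route_nodes,
		   &rt_alloc, &rt_entries, &rt_cache, &dst_entries,
		   &discarded) != 7) {
		fclose(f);
		return -1;
	}
	fclose(f);
	printf("fib nodes=%u route entries=%u dst entries=%u\n",
	       fib_nodes, rt_entries, dst_entries);
	return 0;
}
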
3773 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3774 {
3775         return single_open_net(inode, file, rt6_stats_seq_show);
3776 }
3777
3778 static const struct file_operations rt6_stats_seq_fops = {
3779         .owner   = THIS_MODULE,
3780         .open    = rt6_stats_seq_open,
3781         .read    = seq_read,
3782         .llseek  = seq_lseek,
3783         .release = single_release_net,
3784 };
3785 #endif  /* CONFIG_PROC_FS */
3786
3787 #ifdef CONFIG_SYSCTL
3788
3789 static
3790 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3791                               void __user *buffer, size_t *lenp, loff_t *ppos)
3792 {
3793         struct net *net;
3794         int delay;
3795         if (!write)
3796                 return -EINVAL;
3797
3798         net = (struct net *)ctl->extra1;
3799         delay = net->ipv6.sysctl.flush_delay;
3800         proc_dointvec(ctl, write, buffer, lenp, ppos);
3801         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3802         return 0;
3803 }
3804
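/*
 * Illustrative userspace sketch, not part of this file: writing the "flush"
 * sysctl that ipv6_sysctl_rtcache_flush() handles.  The write is parsed by
 * proc_dointvec() into flush_delay and triggers fib6_run_gc(); the path below
 * assumes the initial network namespace and root privileges (mode 0200).
 */
#include <fcntl.h>
#include <unistd.h>

static int flush_ipv6_routes(void)
{
	int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, "1\n", 2);
	close(fd);
	return n == 2 ? 0 : -1;
}
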
3805 struct ctl_table ipv6_route_table_template[] = {
3806         {
3807                 .procname       =       "flush",
3808                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3809                 .maxlen         =       sizeof(int),
3810                 .mode           =       0200,
3811                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3812         },
3813         {
3814                 .procname       =       "gc_thresh",
3815                 .data           =       &ip6_dst_ops_template.gc_thresh,
3816                 .maxlen         =       sizeof(int),
3817                 .mode           =       0644,
3818                 .proc_handler   =       proc_dointvec,
3819         },
3820         {
3821                 .procname       =       "max_size",
3822                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3823                 .maxlen         =       sizeof(int),
3824                 .mode           =       0644,
3825                 .proc_handler   =       proc_dointvec,
3826         },
3827         {
3828                 .procname       =       "gc_min_interval",
3829                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3830                 .maxlen         =       sizeof(int),
3831                 .mode           =       0644,
3832                 .proc_handler   =       proc_dointvec_jiffies,
3833         },
3834         {
3835                 .procname       =       "gc_timeout",
3836                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3837                 .maxlen         =       sizeof(int),
3838                 .mode           =       0644,
3839                 .proc_handler   =       proc_dointvec_jiffies,
3840         },
3841         {
3842                 .procname       =       "gc_interval",
3843                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3844                 .maxlen         =       sizeof(int),
3845                 .mode           =       0644,
3846                 .proc_handler   =       proc_dointvec_jiffies,
3847         },
3848         {
3849                 .procname       =       "gc_elasticity",
3850                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3851                 .maxlen         =       sizeof(int),
3852                 .mode           =       0644,
3853                 .proc_handler   =       proc_dointvec,
3854         },
3855         {
3856                 .procname       =       "mtu_expires",
3857                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3858                 .maxlen         =       sizeof(int),
3859                 .mode           =       0644,
3860                 .proc_handler   =       proc_dointvec_jiffies,
3861         },
3862         {
3863                 .procname       =       "min_adv_mss",
3864                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3865                 .maxlen         =       sizeof(int),
3866                 .mode           =       0644,
3867                 .proc_handler   =       proc_dointvec,
3868         },
3869         {
3870                 .procname       =       "gc_min_interval_ms",
3871                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3872                 .maxlen         =       sizeof(int),
3873                 .mode           =       0644,
3874                 .proc_handler   =       proc_dointvec_ms_jiffies,
3875         },
3876         { }
3877 };
3878
3879 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3880 {
3881         struct ctl_table *table;
3882
3883         table = kmemdup(ipv6_route_table_template,
3884                         sizeof(ipv6_route_table_template),
3885                         GFP_KERNEL);
3886
3887         if (table) {
3888                 table[0].data = &net->ipv6.sysctl.flush_delay;
3889                 table[0].extra1 = net;
3890                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3891                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3892                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3893                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3894                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3895                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3896                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3897                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3898                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3899
3900                 /* Don't export sysctls to unprivileged users */
3901                 if (net->user_ns != &init_user_ns)
3902                         table[0].procname = NULL;
3903         }
3904
3905         return table;
3906 }
3907 #endif
3908
3909 static int __net_init ip6_route_net_init(struct net *net)
3910 {
3911         int ret = -ENOMEM;
3912
3913         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3914                sizeof(net->ipv6.ip6_dst_ops));
3915
3916         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3917                 goto out_ip6_dst_ops;
3918
3919         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3920                                            sizeof(*net->ipv6.ip6_null_entry),
3921                                            GFP_KERNEL);
3922         if (!net->ipv6.ip6_null_entry)
3923                 goto out_ip6_dst_entries;
3924         net->ipv6.ip6_null_entry->dst.path =
3925                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3926         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3927         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3928                          ip6_template_metrics, true);
3929
3930 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3931         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3932                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3933                                                GFP_KERNEL);
3934         if (!net->ipv6.ip6_prohibit_entry)
3935                 goto out_ip6_null_entry;
3936         net->ipv6.ip6_prohibit_entry->dst.path =
3937                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3938         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3939         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3940                          ip6_template_metrics, true);
3941
3942         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3943                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3944                                                GFP_KERNEL);
3945         if (!net->ipv6.ip6_blk_hole_entry)
3946                 goto out_ip6_prohibit_entry;
3947         net->ipv6.ip6_blk_hole_entry->dst.path =
3948                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3949         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3950         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3951                          ip6_template_metrics, true);
3952 #endif
3953
3954         net->ipv6.sysctl.flush_delay = 0;
3955         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3956         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3957         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3958         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3959         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3960         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3961         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3962
3963         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3964
3965         ret = 0;
3966 out:
3967         return ret;
3968
3969 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3970 out_ip6_prohibit_entry:
3971         kfree(net->ipv6.ip6_prohibit_entry);
3972 out_ip6_null_entry:
3973         kfree(net->ipv6.ip6_null_entry);
3974 #endif
3975 out_ip6_dst_entries:
3976         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3977 out_ip6_dst_ops:
3978         goto out;
3979 }
3980
3981 static void __net_exit ip6_route_net_exit(struct net *net)
3982 {
3983         kfree(net->ipv6.ip6_null_entry);
3984 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3985         kfree(net->ipv6.ip6_prohibit_entry);
3986         kfree(net->ipv6.ip6_blk_hole_entry);
3987 #endif
3988         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3989 }
3990
3991 static int __net_init ip6_route_net_init_late(struct net *net)
3992 {
3993 #ifdef CONFIG_PROC_FS
3994         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3995         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3996 #endif
3997         return 0;
3998 }
3999
4000 static void __net_exit ip6_route_net_exit_late(struct net *net)
4001 {
4002 #ifdef CONFIG_PROC_FS
4003         remove_proc_entry("ipv6_route", net->proc_net);
4004         remove_proc_entry("rt6_stats", net->proc_net);
4005 #endif
4006 }
4007
4008 static struct pernet_operations ip6_route_net_ops = {
4009         .init = ip6_route_net_init,
4010         .exit = ip6_route_net_exit,
4011 };
4012
4013 static int __net_init ipv6_inetpeer_init(struct net *net)
4014 {
4015         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4016
4017         if (!bp)
4018                 return -ENOMEM;
4019         inet_peer_base_init(bp);
4020         net->ipv6.peers = bp;
4021         return 0;
4022 }
4023
4024 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4025 {
4026         struct inet_peer_base *bp = net->ipv6.peers;
4027
4028         net->ipv6.peers = NULL;
4029         inetpeer_invalidate_tree(bp);
4030         kfree(bp);
4031 }
4032
4033 static struct pernet_operations ipv6_inetpeer_ops = {
4034         .init   =       ipv6_inetpeer_init,
4035         .exit   =       ipv6_inetpeer_exit,
4036 };
4037
4038 static struct pernet_operations ip6_route_net_late_ops = {
4039         .init = ip6_route_net_init_late,
4040         .exit = ip6_route_net_exit_late,
4041 };
4042
4043 static struct notifier_block ip6_route_dev_notifier = {
4044         .notifier_call = ip6_route_dev_notify,
4045         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4046 };
4047
4048 void __init ip6_route_init_special_entries(void)
4049 {
4050         /* The loopback device is registered before this code runs, so the
4051          * loopback reference in rt6_info is not taken automatically; do it
4052          * manually for init_net */
4053         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4054         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4055 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4056         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4057         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4058         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4059         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4060 #endif
4061 }
4062
4063 int __init ip6_route_init(void)
4064 {
4065         int ret;
4066         int cpu;
4067
4068         ret = -ENOMEM;
4069         ip6_dst_ops_template.kmem_cachep =
4070                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4071                                   SLAB_HWCACHE_ALIGN, NULL);
4072         if (!ip6_dst_ops_template.kmem_cachep)
4073                 goto out;
4074
4075         ret = dst_entries_init(&ip6_dst_blackhole_ops);
4076         if (ret)
4077                 goto out_kmem_cache;
4078
4079         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4080         if (ret)
4081                 goto out_dst_entries;
4082
4083         ret = register_pernet_subsys(&ip6_route_net_ops);
4084         if (ret)
4085                 goto out_register_inetpeer;
4086
4087         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4088
4089         ret = fib6_init();
4090         if (ret)
4091                 goto out_register_subsys;
4092
4093         ret = xfrm6_init();
4094         if (ret)
4095                 goto out_fib6_init;
4096
4097         ret = fib6_rules_init();
4098         if (ret)
4099                 goto xfrm6_init;
4100
4101         ret = register_pernet_subsys(&ip6_route_net_late_ops);
4102         if (ret)
4103                 goto fib6_rules_init;
4104
4105         ret = -ENOBUFS;
4106         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4107             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4108             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4109                 goto out_register_late_subsys;
4110
4111         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4112         if (ret)
4113                 goto out_register_late_subsys;
4114
4115         for_each_possible_cpu(cpu) {
4116                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4117
4118                 INIT_LIST_HEAD(&ul->head);
4119                 spin_lock_init(&ul->lock);
4120         }
4121
4122 out:
4123         return ret;
4124
4125 out_register_late_subsys:
4126         unregister_pernet_subsys(&ip6_route_net_late_ops);
4127 fib6_rules_init:
4128         fib6_rules_cleanup();
4129 xfrm6_init:
4130         xfrm6_fini();
4131 out_fib6_init:
4132         fib6_gc_cleanup();
4133 out_register_subsys:
4134         unregister_pernet_subsys(&ip6_route_net_ops);
4135 out_register_inetpeer:
4136         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4137 out_dst_entries:
4138         dst_entries_destroy(&ip6_dst_blackhole_ops);
4139 out_kmem_cache:
4140         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4141         goto out;
4142 }
4143
4144 void ip6_route_cleanup(void)
4145 {
4146         unregister_netdevice_notifier(&ip6_route_dev_notifier);
4147         unregister_pernet_subsys(&ip6_route_net_late_ops);
4148         fib6_rules_cleanup();
4149         xfrm6_fini();
4150         fib6_gc_cleanup();
4151         unregister_pernet_subsys(&ipv6_inetpeer_ops);
4152         unregister_pernet_subsys(&ip6_route_net_ops);
4153         dst_entries_destroy(&ip6_dst_blackhole_ops);
4154         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4155 }