Git Repo - linux.git/blob - net/ipv6/route.c
Merge branch 'i2c/for-4.16' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa...
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Result of checking the next-hop neighbour while scoring a route.
 * Every failure is negative; rt6_score_route() returns failures unchanged.
 */
enum rt6_nud_state {
	RT6_NUD_SUCCEED = 1,
	/* find_match() keeps the route at score 0 and asks for round-robin */
	RT6_NUD_FAIL_DO_RR = -1,
	/* neighbour is NUD_FAILED (reported only with CONFIG_IPV6_ROUTER_PREF) */
	RT6_NUD_FAIL_PROBE = -2,
	/* find_match() drops the route from consideration */
	RT6_NUD_FAIL_HARD = -3
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(&rt->from->dst);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* dst_ops.redirect callback for blackhole routes: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* nothing to do */
}
281
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct rt6_info *from = rt->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         rt->from = NULL;
413         dst_release(&from->dst);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458
459         /* We might have already computed the hash for ICMPv6 errors. In such
460          * case it will always be non-zero. Otherwise now is the time to do it.
461          */
462         if (!fl6->mp_hash)
463                 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
464
465         if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
466                 return match;
467
468         list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
469                                  rt6i_siblings) {
470                 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
471                         continue;
472                 if (rt6_score_route(sibling, oif, strict) < 0)
473                         break;
474                 match = sibling;
475                 break;
476         }
477
478         return match;
479 }
480
481 /*
482  *      Route lookup. rcu_read_lock() should be held.
483  */
484
485 static inline struct rt6_info *rt6_device_match(struct net *net,
486                                                     struct rt6_info *rt,
487                                                     const struct in6_addr *saddr,
488                                                     int oif,
489                                                     int flags)
490 {
491         struct rt6_info *local = NULL;
492         struct rt6_info *sprt;
493
494         if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
495                 return rt;
496
497         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
498                 struct net_device *dev = sprt->dst.dev;
499
500                 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
501                         continue;
502
503                 if (oif) {
504                         if (dev->ifindex == oif)
505                                 return sprt;
506                         if (dev->flags & IFF_LOOPBACK) {
507                                 if (!sprt->rt6i_idev ||
508                                     sprt->rt6i_idev->dev->ifindex != oif) {
509                                         if (flags & RT6_LOOKUP_F_IFACE)
510                                                 continue;
511                                         if (local &&
512                                             local->rt6i_idev->dev->ifindex == oif)
513                                                 continue;
514                                 }
515                                 local = sprt;
516                         }
517                 } else {
518                         if (ipv6_chk_addr(net, saddr, dev,
519                                           flags & RT6_LOOKUP_F_IFACE))
520                                 return sprt;
521                 }
522         }
523
524         if (oif) {
525                 if (local)
526                         return local;
527
528                 if (flags & RT6_LOOKUP_F_IFACE)
529                         return net->ipv6.ip6_null_entry;
530         }
531
532         return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
533 }
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537         struct work_struct work;
538         struct in6_addr target;
539         struct net_device *dev;
540 };
541
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544         struct in6_addr mcaddr;
545         struct __rt6_probe_work *work =
546                 container_of(w, struct __rt6_probe_work, work);
547
548         addrconf_addr_solict_mult(&work->target, &mcaddr);
549         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550         dev_put(work->dev);
551         kfree(work);
552 }
553
554 static void rt6_probe(struct rt6_info *rt)
555 {
556         struct __rt6_probe_work *work;
557         struct neighbour *neigh;
558         /*
559          * Okay, this does not seem to be appropriate
560          * for now, however, we need to check if it
561          * is really so; aka Router Reachability Probing.
562          *
563          * Router Reachability Probe MUST be rate-limited
564          * to no more than one per minute.
565          */
566         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567                 return;
568         rcu_read_lock_bh();
569         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570         if (neigh) {
571                 if (neigh->nud_state & NUD_VALID)
572                         goto out;
573
574                 work = NULL;
575                 write_lock(&neigh->lock);
576                 if (!(neigh->nud_state & NUD_VALID) &&
577                     time_after(jiffies,
578                                neigh->updated +
579                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
580                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
581                         if (work)
582                                 __neigh_set_probe_once(neigh);
583                 }
584                 write_unlock(&neigh->lock);
585         } else {
586                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
587         }
588
589         if (work) {
590                 INIT_WORK(&work->work, rt6_probe_deferred);
591                 work->target = rt->rt6i_gateway;
592                 dev_hold(rt->dst.dev);
593                 work->dev = rt->dst.dev;
594                 schedule_work(&work->work);
595         }
596
597 out:
598         rcu_read_unlock_bh();
599 }
600 #else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* nothing to do */
}
604 #endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611         struct net_device *dev = rt->dst.dev;
612         if (!oif || dev->ifindex == oif)
613                 return 2;
614         if ((dev->flags & IFF_LOOPBACK) &&
615             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616                 return 1;
617         return 0;
618 }
619
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622         struct neighbour *neigh;
623         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624
625         if (rt->rt6i_flags & RTF_NONEXTHOP ||
626             !(rt->rt6i_flags & RTF_GATEWAY))
627                 return RT6_NUD_SUCCEED;
628
629         rcu_read_lock_bh();
630         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631         if (neigh) {
632                 read_lock(&neigh->lock);
633                 if (neigh->nud_state & NUD_VALID)
634                         ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636                 else if (!(neigh->nud_state & NUD_FAILED))
637                         ret = RT6_NUD_SUCCEED;
638                 else
639                         ret = RT6_NUD_FAIL_PROBE;
640 #endif
641                 read_unlock(&neigh->lock);
642         } else {
643                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645         }
646         rcu_read_unlock_bh();
647
648         return ret;
649 }
650
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652                            int strict)
653 {
654         int m;
655
656         m = rt6_check_dev(rt, oif);
657         if (!m && (strict & RT6_LOOKUP_F_IFACE))
658                 return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662         if (strict & RT6_LOOKUP_F_REACHABLE) {
663                 int n = rt6_check_neigh(rt);
664                 if (n < 0)
665                         return n;
666         }
667         return m;
668 }
669
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671                                    int *mpri, struct rt6_info *match,
672                                    bool *do_rr)
673 {
674         int m;
675         bool match_do_rr = false;
676         struct inet6_dev *idev = rt->rt6i_idev;
677
678         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
679                 goto out;
680
681         if (idev->cnf.ignore_routes_with_linkdown &&
682             rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
683             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
684                 goto out;
685
686         if (rt6_check_expired(rt))
687                 goto out;
688
689         m = rt6_score_route(rt, oif, strict);
690         if (m == RT6_NUD_FAIL_DO_RR) {
691                 match_do_rr = true;
692                 m = 0; /* lowest valid score */
693         } else if (m == RT6_NUD_FAIL_HARD) {
694                 goto out;
695         }
696
697         if (strict & RT6_LOOKUP_F_REACHABLE)
698                 rt6_probe(rt);
699
700         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
701         if (m > *mpri) {
702                 *do_rr = match_do_rr;
703                 *mpri = m;
704                 match = rt;
705         }
706 out:
707         return match;
708 }
709
710 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
711                                      struct rt6_info *leaf,
712                                      struct rt6_info *rr_head,
713                                      u32 metric, int oif, int strict,
714                                      bool *do_rr)
715 {
716         struct rt6_info *rt, *match, *cont;
717         int mpri = -1;
718
719         match = NULL;
720         cont = NULL;
721         for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
722                 if (rt->rt6i_metric != metric) {
723                         cont = rt;
724                         break;
725                 }
726
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728         }
729
730         for (rt = leaf; rt && rt != rr_head;
731              rt = rcu_dereference(rt->rt6_next)) {
732                 if (rt->rt6i_metric != metric) {
733                         cont = rt;
734                         break;
735                 }
736
737                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
738         }
739
740         if (match || !cont)
741                 return match;
742
743         for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
744                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
745
746         return match;
747 }
748
749 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
750                                    int oif, int strict)
751 {
752         struct rt6_info *leaf = rcu_dereference(fn->leaf);
753         struct rt6_info *match, *rt0;
754         bool do_rr = false;
755         int key_plen;
756
757         if (!leaf || leaf == net->ipv6.ip6_null_entry)
758                 return net->ipv6.ip6_null_entry;
759
760         rt0 = rcu_dereference(fn->rr_ptr);
761         if (!rt0)
762                 rt0 = leaf;
763
764         /* Double check to make sure fn is not an intermediate node
765          * and fn->leaf does not points to its child's leaf
766          * (This might happen if all routes under fn are deleted from
767          * the tree and fib6_repair_tree() is called on the node.)
768          */
769         key_plen = rt0->rt6i_dst.plen;
770 #ifdef CONFIG_IPV6_SUBTREES
771         if (rt0->rt6i_src.plen)
772                 key_plen = rt0->rt6i_src.plen;
773 #endif
774         if (fn->fn_bit != key_plen)
775                 return net->ipv6.ip6_null_entry;
776
777         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
778                              &do_rr);
779
780         if (do_rr) {
781                 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
782
783                 /* no entries matched; do round-robin */
784                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
785                         next = leaf;
786
787                 if (next != rt0) {
788                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
789                         /* make sure next is not being deleted from the tree */
790                         if (next->rt6i_node)
791                                 rcu_assign_pointer(fn->rr_ptr, next);
792                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
793                 }
794         }
795
796         return match ? match : net->ipv6.ip6_null_entry;
797 }
798
799 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
800 {
801         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
802 }
803
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process one Route Information option (RFC 4191).
 *
 * @dev:    device handed in by the caller; routes are looked up and
 *          created on it
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address passed to the route lookup/creation helpers
 *
 * A zero lifetime deletes the matching route.  A non-zero lifetime creates
 * the route when it does not exist yet, otherwise its RTF_ROUTEINFO flag
 * and preference bits are refreshed.  The expiry is then set from the
 * lifetime, or cleared when the lifetime is not finite.
 *
 * Returns 0 once the option passed validation (even if no route ends up
 * installed), or -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length.
	 *
	 * The option length counts units of 8 octets and includes the
	 * 8 octet fixed header, so length 1 carries no prefix bytes,
	 * length 2 carries 8 and length 3 carries 16.  RFC 4191 section 2.3
	 * therefore requires length 3 for prefix_len > 64 and length 2 or 3
	 * for prefix_len > 0.  Accepting anything shorter would make the
	 * prefix copy below read bytes that are not part of the option.
	 */
	if (rinfo->length < 1 || rinfo->length > 3)
		return -EINVAL;
	/* The declared option size must fit into the buffer we were given */
	if (len < (rinfo->length << 3))
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		/* Full 16 byte prefix present: use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* Shorter option: build the prefix in a local buffer.
		 * The checks above guarantee that the option holds at
		 * least the bytes covered by prefix_len.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero-length prefix describes the default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Lifetime 0 withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
879
880 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
881                                         struct in6_addr *saddr)
882 {
883         struct fib6_node *pn, *sn;
884         while (1) {
885                 if (fn->fn_flags & RTN_TL_ROOT)
886                         return NULL;
887                 pn = rcu_dereference(fn->parent);
888                 sn = FIB6_SUBTREE(pn);
889                 if (sn && sn != fn)
890                         fn = fib6_lookup(sn, NULL, saddr);
891                 else
892                         fn = pn;
893                 if (fn->fn_flags & RTN_RTINFO)
894                         return fn;
895         }
896 }
897
898 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899                           bool null_fallback)
900 {
901         struct rt6_info *rt = *prt;
902
903         if (dst_hold_safe(&rt->dst))
904                 return true;
905         if (null_fallback) {
906                 rt = net->ipv6.ip6_null_entry;
907                 dst_hold(&rt->dst);
908         } else {
909                 rt = NULL;
910         }
911         *prt = rt;
912         return false;
913 }
914
915 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
916                                              struct fib6_table *table,
917                                              struct flowi6 *fl6, int flags)
918 {
919         struct rt6_info *rt, *rt_cache;
920         struct fib6_node *fn;
921
922         rcu_read_lock();
923         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
924 restart:
925         rt = rcu_dereference(fn->leaf);
926         if (!rt) {
927                 rt = net->ipv6.ip6_null_entry;
928         } else {
929                 rt = rt6_device_match(net, rt, &fl6->saddr,
930                                       fl6->flowi6_oif, flags);
931                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
932                         rt = rt6_multipath_select(rt, fl6,
933                                                   fl6->flowi6_oif, flags);
934         }
935         if (rt == net->ipv6.ip6_null_entry) {
936                 fn = fib6_backtrack(fn, &fl6->saddr);
937                 if (fn)
938                         goto restart;
939         }
940         /* Search through exception table */
941         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
942         if (rt_cache)
943                 rt = rt_cache;
944
945         if (ip6_hold_safe(net, &rt, true))
946                 dst_use_noref(&rt->dst, jiffies);
947
948         rcu_read_unlock();
949
950         trace_fib6_table_lookup(net, rt, table, fl6);
951
952         return rt;
953
954 }
955
956 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
957                                     int flags)
958 {
959         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
960 }
961 EXPORT_SYMBOL_GPL(ip6_route_lookup);
962
963 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
964                             const struct in6_addr *saddr, int oif, int strict)
965 {
966         struct flowi6 fl6 = {
967                 .flowi6_oif = oif,
968                 .daddr = *daddr,
969         };
970         struct dst_entry *dst;
971         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
972
973         if (saddr) {
974                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
975                 flags |= RT6_LOOKUP_F_HAS_SADDR;
976         }
977
978         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
979         if (dst->error == 0)
980                 return (struct rt6_info *) dst;
981
982         dst_release(dst);
983
984         return NULL;
985 }
986 EXPORT_SYMBOL(rt6_lookup);
987
988 /* ip6_ins_rt is called with FREE table->tb6_lock.
989  * It takes new route entry, the addition fails by any reason the
990  * route is released.
991  * Caller must hold dst before calling it.
992  */
993
994 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
995                         struct mx6_config *mxc,
996                         struct netlink_ext_ack *extack)
997 {
998         int err;
999         struct fib6_table *table;
1000
1001         table = rt->rt6i_table;
1002         spin_lock_bh(&table->tb6_lock);
1003         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1004         spin_unlock_bh(&table->tb6_lock);
1005
1006         return err;
1007 }
1008
1009 int ip6_ins_rt(struct rt6_info *rt)
1010 {
1011         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1012         struct mx6_config mxc = { .mx = NULL, };
1013
1014         /* Hold dst to account for the reference from the fib6 tree */
1015         dst_hold(&rt->dst);
1016         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1017 }
1018
1019 /* called with rcu_lock held */
1020 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1021 {
1022         struct net_device *dev = rt->dst.dev;
1023
1024         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1025                 /* for copies of local routes, dst->dev needs to be the
1026                  * device if it is a master device, the master device if
1027                  * device is enslaved, and the loopback as the default
1028                  */
1029                 if (netif_is_l3_slave(dev) &&
1030                     !rt6_need_strict(&rt->rt6i_dst.addr))
1031                         dev = l3mdev_master_dev_rcu(dev);
1032                 else if (!netif_is_l3_master(dev))
1033                         dev = dev_net(dev)->loopback_dev;
1034                 /* last case is netif_is_l3_master(dev) is true in which
1035                  * case we want dev returned to be dev
1036                  */
1037         }
1038
1039         return dev;
1040 }
1041
1042 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1043                                            const struct in6_addr *daddr,
1044                                            const struct in6_addr *saddr)
1045 {
1046         struct net_device *dev;
1047         struct rt6_info *rt;
1048
1049         /*
1050          *      Clone the route.
1051          */
1052
1053         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1054                 ort = ort->from;
1055
1056         rcu_read_lock();
1057         dev = ip6_rt_get_dev_rcu(ort);
1058         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1059         rcu_read_unlock();
1060         if (!rt)
1061                 return NULL;
1062
1063         ip6_rt_copy_init(rt, ort);
1064         rt->rt6i_flags |= RTF_CACHE;
1065         rt->rt6i_metric = 0;
1066         rt->dst.flags |= DST_HOST;
1067         rt->rt6i_dst.addr = *daddr;
1068         rt->rt6i_dst.plen = 128;
1069
1070         if (!rt6_is_gw_or_nonexthop(ort)) {
1071                 if (ort->rt6i_dst.plen != 128 &&
1072                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1073                         rt->rt6i_flags |= RTF_ANYCAST;
1074 #ifdef CONFIG_IPV6_SUBTREES
1075                 if (rt->rt6i_src.plen && saddr) {
1076                         rt->rt6i_src.addr = *saddr;
1077                         rt->rt6i_src.plen = 128;
1078                 }
1079 #endif
1080         }
1081
1082         return rt;
1083 }
1084
1085 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1086 {
1087         struct net_device *dev;
1088         struct rt6_info *pcpu_rt;
1089
1090         rcu_read_lock();
1091         dev = ip6_rt_get_dev_rcu(rt);
1092         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1093         rcu_read_unlock();
1094         if (!pcpu_rt)
1095                 return NULL;
1096         ip6_rt_copy_init(pcpu_rt, rt);
1097         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1098         pcpu_rt->rt6i_flags |= RTF_PCPU;
1099         return pcpu_rt;
1100 }
1101
1102 /* It should be called with rcu_read_lock() acquired */
1103 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1104 {
1105         struct rt6_info *pcpu_rt, **p;
1106
1107         p = this_cpu_ptr(rt->rt6i_pcpu);
1108         pcpu_rt = *p;
1109
1110         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1111                 rt6_dst_from_metrics_check(pcpu_rt);
1112
1113         return pcpu_rt;
1114 }
1115
1116 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1117 {
1118         struct rt6_info *pcpu_rt, *prev, **p;
1119
1120         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1121         if (!pcpu_rt) {
1122                 struct net *net = dev_net(rt->dst.dev);
1123
1124                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1125                 return net->ipv6.ip6_null_entry;
1126         }
1127
1128         dst_hold(&pcpu_rt->dst);
1129         p = this_cpu_ptr(rt->rt6i_pcpu);
1130         prev = cmpxchg(p, NULL, pcpu_rt);
1131         BUG_ON(prev);
1132
1133         rt6_dst_from_metrics_check(pcpu_rt);
1134         return pcpu_rt;
1135 }
1136
1137 /* exception hash table implementation
1138  */
1139 static DEFINE_SPINLOCK(rt6_exception_lock);
1140
1141 /* Remove rt6_ex from hash table and free the memory
1142  * Caller must hold rt6_exception_lock
1143  */
1144 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1145                                  struct rt6_exception *rt6_ex)
1146 {
1147         struct net *net;
1148
1149         if (!bucket || !rt6_ex)
1150                 return;
1151
1152         net = dev_net(rt6_ex->rt6i->dst.dev);
1153         rt6_ex->rt6i->rt6i_node = NULL;
1154         hlist_del_rcu(&rt6_ex->hlist);
1155         rt6_release(rt6_ex->rt6i);
1156         kfree_rcu(rt6_ex, rcu);
1157         WARN_ON_ONCE(!bucket->depth);
1158         bucket->depth--;
1159         net->ipv6.rt6_stats->fib_rt_cache--;
1160 }
1161
1162 /* Remove oldest rt6_ex in bucket and free the memory
1163  * Caller must hold rt6_exception_lock
1164  */
1165 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1166 {
1167         struct rt6_exception *rt6_ex, *oldest = NULL;
1168
1169         if (!bucket)
1170                 return;
1171
1172         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1173                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1174                         oldest = rt6_ex;
1175         }
1176         rt6_remove_exception(bucket, oldest);
1177 }
1178
1179 static u32 rt6_exception_hash(const struct in6_addr *dst,
1180                               const struct in6_addr *src)
1181 {
1182         static u32 seed __read_mostly;
1183         u32 val;
1184
1185         net_get_random_once(&seed, sizeof(seed));
1186         val = jhash(dst, sizeof(*dst), seed);
1187
1188 #ifdef CONFIG_IPV6_SUBTREES
1189         if (src)
1190                 val = jhash(src, sizeof(*src), val);
1191 #endif
1192         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1193 }
1194
1195 /* Helper function to find the cached rt in the hash table
1196  * and update bucket pointer to point to the bucket for this
1197  * (daddr, saddr) pair
1198  * Caller must hold rt6_exception_lock
1199  */
1200 static struct rt6_exception *
1201 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1202                               const struct in6_addr *daddr,
1203                               const struct in6_addr *saddr)
1204 {
1205         struct rt6_exception *rt6_ex;
1206         u32 hval;
1207
1208         if (!(*bucket) || !daddr)
1209                 return NULL;
1210
1211         hval = rt6_exception_hash(daddr, saddr);
1212         *bucket += hval;
1213
1214         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1215                 struct rt6_info *rt6 = rt6_ex->rt6i;
1216                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1217
1218 #ifdef CONFIG_IPV6_SUBTREES
1219                 if (matched && saddr)
1220                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1221 #endif
1222                 if (matched)
1223                         return rt6_ex;
1224         }
1225         return NULL;
1226 }
1227
1228 /* Helper function to find the cached rt in the hash table
1229  * and update bucket pointer to point to the bucket for this
1230  * (daddr, saddr) pair
1231  * Caller must hold rcu_read_lock()
1232  */
1233 static struct rt6_exception *
1234 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1235                          const struct in6_addr *daddr,
1236                          const struct in6_addr *saddr)
1237 {
1238         struct rt6_exception *rt6_ex;
1239         u32 hval;
1240
1241         WARN_ON_ONCE(!rcu_read_lock_held());
1242
1243         if (!(*bucket) || !daddr)
1244                 return NULL;
1245
1246         hval = rt6_exception_hash(daddr, saddr);
1247         *bucket += hval;
1248
1249         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1250                 struct rt6_info *rt6 = rt6_ex->rt6i;
1251                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1252
1253 #ifdef CONFIG_IPV6_SUBTREES
1254                 if (matched && saddr)
1255                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1256 #endif
1257                 if (matched)
1258                         return rt6_ex;
1259         }
1260         return NULL;
1261 }
1262
1263 static int rt6_insert_exception(struct rt6_info *nrt,
1264                                 struct rt6_info *ort)
1265 {
1266         struct net *net = dev_net(ort->dst.dev);
1267         struct rt6_exception_bucket *bucket;
1268         struct in6_addr *src_key = NULL;
1269         struct rt6_exception *rt6_ex;
1270         int err = 0;
1271
1272         /* ort can't be a cache or pcpu route */
1273         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1274                 ort = ort->from;
1275         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1276
1277         spin_lock_bh(&rt6_exception_lock);
1278
1279         if (ort->exception_bucket_flushed) {
1280                 err = -EINVAL;
1281                 goto out;
1282         }
1283
1284         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1285                                         lockdep_is_held(&rt6_exception_lock));
1286         if (!bucket) {
1287                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1288                                  GFP_ATOMIC);
1289                 if (!bucket) {
1290                         err = -ENOMEM;
1291                         goto out;
1292                 }
1293                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1294         }
1295
1296 #ifdef CONFIG_IPV6_SUBTREES
1297         /* rt6i_src.plen != 0 indicates ort is in subtree
1298          * and exception table is indexed by a hash of
1299          * both rt6i_dst and rt6i_src.
1300          * Otherwise, the exception table is indexed by
1301          * a hash of only rt6i_dst.
1302          */
1303         if (ort->rt6i_src.plen)
1304                 src_key = &nrt->rt6i_src.addr;
1305 #endif
1306
1307         /* Update rt6i_prefsrc as it could be changed
1308          * in rt6_remove_prefsrc()
1309          */
1310         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1311         /* rt6_mtu_change() might lower mtu on ort.
1312          * Only insert this exception route if its mtu
1313          * is less than ort's mtu value.
1314          */
1315         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1316                 err = -EINVAL;
1317                 goto out;
1318         }
1319
1320         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1321                                                src_key);
1322         if (rt6_ex)
1323                 rt6_remove_exception(bucket, rt6_ex);
1324
1325         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1326         if (!rt6_ex) {
1327                 err = -ENOMEM;
1328                 goto out;
1329         }
1330         rt6_ex->rt6i = nrt;
1331         rt6_ex->stamp = jiffies;
1332         atomic_inc(&nrt->rt6i_ref);
1333         nrt->rt6i_node = ort->rt6i_node;
1334         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1335         bucket->depth++;
1336         net->ipv6.rt6_stats->fib_rt_cache++;
1337
1338         if (bucket->depth > FIB6_MAX_DEPTH)
1339                 rt6_exception_remove_oldest(bucket);
1340
1341 out:
1342         spin_unlock_bh(&rt6_exception_lock);
1343
1344         /* Update fn->fn_sernum to invalidate all cached dst */
1345         if (!err) {
1346                 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1347                 fib6_update_sernum(ort);
1348                 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1349                 fib6_force_start_gc(net);
1350         }
1351
1352         return err;
1353 }
1354
1355 void rt6_flush_exceptions(struct rt6_info *rt)
1356 {
1357         struct rt6_exception_bucket *bucket;
1358         struct rt6_exception *rt6_ex;
1359         struct hlist_node *tmp;
1360         int i;
1361
1362         spin_lock_bh(&rt6_exception_lock);
1363         /* Prevent rt6_insert_exception() to recreate the bucket list */
1364         rt->exception_bucket_flushed = 1;
1365
1366         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1367                                     lockdep_is_held(&rt6_exception_lock));
1368         if (!bucket)
1369                 goto out;
1370
1371         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1372                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1373                         rt6_remove_exception(bucket, rt6_ex);
1374                 WARN_ON_ONCE(bucket->depth);
1375                 bucket++;
1376         }
1377
1378 out:
1379         spin_unlock_bh(&rt6_exception_lock);
1380 }
1381
1382 /* Find cached rt in the hash table inside passed in rt
1383  * Caller has to hold rcu_read_lock()
1384  */
1385 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1386                                            struct in6_addr *daddr,
1387                                            struct in6_addr *saddr)
1388 {
1389         struct rt6_exception_bucket *bucket;
1390         struct in6_addr *src_key = NULL;
1391         struct rt6_exception *rt6_ex;
1392         struct rt6_info *res = NULL;
1393
1394         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1395
1396 #ifdef CONFIG_IPV6_SUBTREES
1397         /* rt6i_src.plen != 0 indicates rt is in subtree
1398          * and exception table is indexed by a hash of
1399          * both rt6i_dst and rt6i_src.
1400          * Otherwise, the exception table is indexed by
1401          * a hash of only rt6i_dst.
1402          */
1403         if (rt->rt6i_src.plen)
1404                 src_key = saddr;
1405 #endif
1406         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1407
1408         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1409                 res = rt6_ex->rt6i;
1410
1411         return res;
1412 }
1413
1414 /* Remove the passed in cached rt from the hash table that contains it */
1415 int rt6_remove_exception_rt(struct rt6_info *rt)
1416 {
1417         struct rt6_exception_bucket *bucket;
1418         struct rt6_info *from = rt->from;
1419         struct in6_addr *src_key = NULL;
1420         struct rt6_exception *rt6_ex;
1421         int err;
1422
1423         if (!from ||
1424             !(rt->rt6i_flags & RTF_CACHE))
1425                 return -EINVAL;
1426
1427         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1428                 return -ENOENT;
1429
1430         spin_lock_bh(&rt6_exception_lock);
1431         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1432                                     lockdep_is_held(&rt6_exception_lock));
1433 #ifdef CONFIG_IPV6_SUBTREES
1434         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1435          * and exception table is indexed by a hash of
1436          * both rt6i_dst and rt6i_src.
1437          * Otherwise, the exception table is indexed by
1438          * a hash of only rt6i_dst.
1439          */
1440         if (from->rt6i_src.plen)
1441                 src_key = &rt->rt6i_src.addr;
1442 #endif
1443         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1444                                                &rt->rt6i_dst.addr,
1445                                                src_key);
1446         if (rt6_ex) {
1447                 rt6_remove_exception(bucket, rt6_ex);
1448                 err = 0;
1449         } else {
1450                 err = -ENOENT;
1451         }
1452
1453         spin_unlock_bh(&rt6_exception_lock);
1454         return err;
1455 }
1456
1457 /* Find rt6_ex which contains the passed in rt cache and
1458  * refresh its stamp
1459  */
1460 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1461 {
1462         struct rt6_exception_bucket *bucket;
1463         struct rt6_info *from = rt->from;
1464         struct in6_addr *src_key = NULL;
1465         struct rt6_exception *rt6_ex;
1466
1467         if (!from ||
1468             !(rt->rt6i_flags & RTF_CACHE))
1469                 return;
1470
1471         rcu_read_lock();
1472         bucket = rcu_dereference(from->rt6i_exception_bucket);
1473
1474 #ifdef CONFIG_IPV6_SUBTREES
1475         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1476          * and exception table is indexed by a hash of
1477          * both rt6i_dst and rt6i_src.
1478          * Otherwise, the exception table is indexed by
1479          * a hash of only rt6i_dst.
1480          */
1481         if (from->rt6i_src.plen)
1482                 src_key = &rt->rt6i_src.addr;
1483 #endif
1484         rt6_ex = __rt6_find_exception_rcu(&bucket,
1485                                           &rt->rt6i_dst.addr,
1486                                           src_key);
1487         if (rt6_ex)
1488                 rt6_ex->stamp = jiffies;
1489
1490         rcu_read_unlock();
1491 }
1492
1493 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1494 {
1495         struct rt6_exception_bucket *bucket;
1496         struct rt6_exception *rt6_ex;
1497         int i;
1498
1499         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1500                                         lockdep_is_held(&rt6_exception_lock));
1501
1502         if (bucket) {
1503                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1504                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1505                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1506                         }
1507                         bucket++;
1508                 }
1509         }
1510 }
1511
1512 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1513 {
1514         struct rt6_exception_bucket *bucket;
1515         struct rt6_exception *rt6_ex;
1516         int i;
1517
1518         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519                                         lockdep_is_held(&rt6_exception_lock));
1520
1521         if (bucket) {
1522                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1523                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1524                                 struct rt6_info *entry = rt6_ex->rt6i;
1525                                 /* For RTF_CACHE with rt6i_pmtu == 0
1526                                  * (i.e. a redirected route),
1527                                  * the metrics of its rt->dst.from has already
1528                                  * been updated.
1529                                  */
1530                                 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1531                                         entry->rt6i_pmtu = mtu;
1532                         }
1533                         bucket++;
1534                 }
1535         }
1536 }
1537
1538 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1539
1540 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1541                                         struct in6_addr *gateway)
1542 {
1543         struct rt6_exception_bucket *bucket;
1544         struct rt6_exception *rt6_ex;
1545         struct hlist_node *tmp;
1546         int i;
1547
1548         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1549                 return;
1550
1551         spin_lock_bh(&rt6_exception_lock);
1552         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1553                                      lockdep_is_held(&rt6_exception_lock));
1554
1555         if (bucket) {
1556                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1557                         hlist_for_each_entry_safe(rt6_ex, tmp,
1558                                                   &bucket->chain, hlist) {
1559                                 struct rt6_info *entry = rt6_ex->rt6i;
1560
1561                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1562                                     RTF_CACHE_GATEWAY &&
1563                                     ipv6_addr_equal(gateway,
1564                                                     &entry->rt6i_gateway)) {
1565                                         rt6_remove_exception(bucket, rt6_ex);
1566                                 }
1567                         }
1568                         bucket++;
1569                 }
1570         }
1571
1572         spin_unlock_bh(&rt6_exception_lock);
1573 }
1574
1575 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1576                                       struct rt6_exception *rt6_ex,
1577                                       struct fib6_gc_args *gc_args,
1578                                       unsigned long now)
1579 {
1580         struct rt6_info *rt = rt6_ex->rt6i;
1581
1582         /* we are pruning and obsoleting aged-out and non gateway exceptions
1583          * even if others have still references to them, so that on next
1584          * dst_check() such references can be dropped.
1585          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1586          * expired, independently from their aging, as per RFC 8201 section 4
1587          */
1588         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1589                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1590                         RT6_TRACE("aging clone %p\n", rt);
1591                         rt6_remove_exception(bucket, rt6_ex);
1592                         return;
1593                 }
1594         } else if (time_after(jiffies, rt->dst.expires)) {
1595                 RT6_TRACE("purging expired route %p\n", rt);
1596                 rt6_remove_exception(bucket, rt6_ex);
1597                 return;
1598         }
1599
1600         if (rt->rt6i_flags & RTF_GATEWAY) {
1601                 struct neighbour *neigh;
1602                 __u8 neigh_flags = 0;
1603
1604                 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1605                 if (neigh) {
1606                         neigh_flags = neigh->flags;
1607                         neigh_release(neigh);
1608                 }
1609                 if (!(neigh_flags & NTF_ROUTER)) {
1610                         RT6_TRACE("purging route %p via non-router but gateway\n",
1611                                   rt);
1612                         rt6_remove_exception(bucket, rt6_ex);
1613                         return;
1614                 }
1615         }
1616
1617         gc_args->more++;
1618 }
1619
1620 void rt6_age_exceptions(struct rt6_info *rt,
1621                         struct fib6_gc_args *gc_args,
1622                         unsigned long now)
1623 {
1624         struct rt6_exception_bucket *bucket;
1625         struct rt6_exception *rt6_ex;
1626         struct hlist_node *tmp;
1627         int i;
1628
1629         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1630                 return;
1631
1632         spin_lock_bh(&rt6_exception_lock);
1633         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1634                                     lockdep_is_held(&rt6_exception_lock));
1635
1636         if (bucket) {
1637                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1638                         hlist_for_each_entry_safe(rt6_ex, tmp,
1639                                                   &bucket->chain, hlist) {
1640                                 rt6_age_examine_exception(bucket, rt6_ex,
1641                                                           gc_args, now);
1642                         }
1643                         bucket++;
1644                 }
1645         }
1646         spin_unlock_bh(&rt6_exception_lock);
1647 }
1648
1649 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1650                                int oif, struct flowi6 *fl6, int flags)
1651 {
1652         struct fib6_node *fn, *saved_fn;
1653         struct rt6_info *rt, *rt_cache;
1654         int strict = 0;
1655
1656         strict |= flags & RT6_LOOKUP_F_IFACE;
1657         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1658         if (net->ipv6.devconf_all->forwarding == 0)
1659                 strict |= RT6_LOOKUP_F_REACHABLE;
1660
1661         rcu_read_lock();
1662
1663         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1664         saved_fn = fn;
1665
1666         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1667                 oif = 0;
1668
1669 redo_rt6_select:
1670         rt = rt6_select(net, fn, oif, strict);
1671         if (rt->rt6i_nsiblings)
1672                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1673         if (rt == net->ipv6.ip6_null_entry) {
1674                 fn = fib6_backtrack(fn, &fl6->saddr);
1675                 if (fn)
1676                         goto redo_rt6_select;
1677                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1678                         /* also consider unreachable route */
1679                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1680                         fn = saved_fn;
1681                         goto redo_rt6_select;
1682                 }
1683         }
1684
1685         /*Search through exception table */
1686         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1687         if (rt_cache)
1688                 rt = rt_cache;
1689
1690         if (rt == net->ipv6.ip6_null_entry) {
1691                 rcu_read_unlock();
1692                 dst_hold(&rt->dst);
1693                 trace_fib6_table_lookup(net, rt, table, fl6);
1694                 return rt;
1695         } else if (rt->rt6i_flags & RTF_CACHE) {
1696                 if (ip6_hold_safe(net, &rt, true)) {
1697                         dst_use_noref(&rt->dst, jiffies);
1698                         rt6_dst_from_metrics_check(rt);
1699                 }
1700                 rcu_read_unlock();
1701                 trace_fib6_table_lookup(net, rt, table, fl6);
1702                 return rt;
1703         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1704                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1705                 /* Create a RTF_CACHE clone which will not be
1706                  * owned by the fib6 tree.  It is for the special case where
1707                  * the daddr in the skb during the neighbor look-up is different
1708                  * from the fl6->daddr used to look-up route here.
1709                  */
1710
1711                 struct rt6_info *uncached_rt;
1712
1713                 if (ip6_hold_safe(net, &rt, true)) {
1714                         dst_use_noref(&rt->dst, jiffies);
1715                 } else {
1716                         rcu_read_unlock();
1717                         uncached_rt = rt;
1718                         goto uncached_rt_out;
1719                 }
1720                 rcu_read_unlock();
1721
1722                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1723                 dst_release(&rt->dst);
1724
1725                 if (uncached_rt) {
1726                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1727                          * No need for another dst_hold()
1728                          */
1729                         rt6_uncached_list_add(uncached_rt);
1730                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1731                 } else {
1732                         uncached_rt = net->ipv6.ip6_null_entry;
1733                         dst_hold(&uncached_rt->dst);
1734                 }
1735
1736 uncached_rt_out:
1737                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1738                 return uncached_rt;
1739
1740         } else {
1741                 /* Get a percpu copy */
1742
1743                 struct rt6_info *pcpu_rt;
1744
1745                 dst_use_noref(&rt->dst, jiffies);
1746                 local_bh_disable();
1747                 pcpu_rt = rt6_get_pcpu_route(rt);
1748
1749                 if (!pcpu_rt) {
1750                         /* atomic_inc_not_zero() is needed when using rcu */
1751                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1752                                 /* No dst_hold() on rt is needed because grabbing
1753                                  * rt->rt6i_ref makes sure rt can't be released.
1754                                  */
1755                                 pcpu_rt = rt6_make_pcpu_route(rt);
1756                                 rt6_release(rt);
1757                         } else {
1758                                 /* rt is already removed from tree */
1759                                 pcpu_rt = net->ipv6.ip6_null_entry;
1760                                 dst_hold(&pcpu_rt->dst);
1761                         }
1762                 }
1763                 local_bh_enable();
1764                 rcu_read_unlock();
1765                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1766                 return pcpu_rt;
1767         }
1768 }
1769 EXPORT_SYMBOL_GPL(ip6_pol_route);
1770
1771 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1772                                             struct flowi6 *fl6, int flags)
1773 {
1774         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1775 }
1776
1777 struct dst_entry *ip6_route_input_lookup(struct net *net,
1778                                          struct net_device *dev,
1779                                          struct flowi6 *fl6, int flags)
1780 {
1781         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1782                 flags |= RT6_LOOKUP_F_IFACE;
1783
1784         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1785 }
1786 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1787
1788 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1789                                   struct flow_keys *keys)
1790 {
1791         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1792         const struct ipv6hdr *key_iph = outer_iph;
1793         const struct ipv6hdr *inner_iph;
1794         const struct icmp6hdr *icmph;
1795         struct ipv6hdr _inner_iph;
1796
1797         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1798                 goto out;
1799
1800         icmph = icmp6_hdr(skb);
1801         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1802             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1803             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1804             icmph->icmp6_type != ICMPV6_PARAMPROB)
1805                 goto out;
1806
1807         inner_iph = skb_header_pointer(skb,
1808                                        skb_transport_offset(skb) + sizeof(*icmph),
1809                                        sizeof(_inner_iph), &_inner_iph);
1810         if (!inner_iph)
1811                 goto out;
1812
1813         key_iph = inner_iph;
1814 out:
1815         memset(keys, 0, sizeof(*keys));
1816         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1817         keys->addrs.v6addrs.src = key_iph->saddr;
1818         keys->addrs.v6addrs.dst = key_iph->daddr;
1819         keys->tags.flow_label = ip6_flowinfo(key_iph);
1820         keys->basic.ip_proto = key_iph->nexthdr;
1821 }
1822
1823 /* if skb is set it will be used and fl6 can be NULL */
1824 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1825 {
1826         struct flow_keys hash_keys;
1827
1828         if (skb) {
1829                 ip6_multipath_l3_keys(skb, &hash_keys);
1830                 return flow_hash_from_keys(&hash_keys) >> 1;
1831         }
1832
1833         return get_hash_from_flowi6(fl6) >> 1;
1834 }
1835
1836 void ip6_route_input(struct sk_buff *skb)
1837 {
1838         const struct ipv6hdr *iph = ipv6_hdr(skb);
1839         struct net *net = dev_net(skb->dev);
1840         int flags = RT6_LOOKUP_F_HAS_SADDR;
1841         struct ip_tunnel_info *tun_info;
1842         struct flowi6 fl6 = {
1843                 .flowi6_iif = skb->dev->ifindex,
1844                 .daddr = iph->daddr,
1845                 .saddr = iph->saddr,
1846                 .flowlabel = ip6_flowinfo(iph),
1847                 .flowi6_mark = skb->mark,
1848                 .flowi6_proto = iph->nexthdr,
1849         };
1850
1851         tun_info = skb_tunnel_info(skb);
1852         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1853                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1854         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1855                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1856         skb_dst_drop(skb);
1857         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1858 }
1859
1860 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1861                                              struct flowi6 *fl6, int flags)
1862 {
1863         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1864 }
1865
1866 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1867                                          struct flowi6 *fl6, int flags)
1868 {
1869         bool any_src;
1870
1871         if (rt6_need_strict(&fl6->daddr)) {
1872                 struct dst_entry *dst;
1873
1874                 dst = l3mdev_link_scope_lookup(net, fl6);
1875                 if (dst)
1876                         return dst;
1877         }
1878
1879         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1880
1881         any_src = ipv6_addr_any(&fl6->saddr);
1882         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1883             (fl6->flowi6_oif && any_src))
1884                 flags |= RT6_LOOKUP_F_IFACE;
1885
1886         if (!any_src)
1887                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1888         else if (sk)
1889                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1890
1891         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1892 }
1893 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1894
1895 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1896 {
1897         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1898         struct net_device *loopback_dev = net->loopback_dev;
1899         struct dst_entry *new = NULL;
1900
1901         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1902                        DST_OBSOLETE_DEAD, 0);
1903         if (rt) {
1904                 rt6_info_init(rt);
1905                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1906
1907                 new = &rt->dst;
1908                 new->__use = 1;
1909                 new->input = dst_discard;
1910                 new->output = dst_discard_out;
1911
1912                 dst_copy_metrics(new, &ort->dst);
1913
1914                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1915                 rt->rt6i_gateway = ort->rt6i_gateway;
1916                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1917                 rt->rt6i_metric = 0;
1918
1919                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1920 #ifdef CONFIG_IPV6_SUBTREES
1921                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1922 #endif
1923         }
1924
1925         dst_release(dst_orig);
1926         return new ? new : ERR_PTR(-ENOMEM);
1927 }
1928
1929 /*
1930  *      Destination cache support functions
1931  */
1932
1933 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1934 {
1935         if (rt->from &&
1936             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1937                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1938 }
1939
1940 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1941 {
1942         u32 rt_cookie = 0;
1943
1944         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1945                 return NULL;
1946
1947         if (rt6_check_expired(rt))
1948                 return NULL;
1949
1950         return &rt->dst;
1951 }
1952
1953 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1954 {
1955         if (!__rt6_check_expired(rt) &&
1956             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1957             rt6_check(rt->from, cookie))
1958                 return &rt->dst;
1959         else
1960                 return NULL;
1961 }
1962
1963 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1964 {
1965         struct rt6_info *rt;
1966
1967         rt = (struct rt6_info *) dst;
1968
1969         /* All IPV6 dsts are created with ->obsolete set to the value
1970          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1971          * into this function always.
1972          */
1973
1974         rt6_dst_from_metrics_check(rt);
1975
1976         if (rt->rt6i_flags & RTF_PCPU ||
1977             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
1978                 return rt6_dst_from_check(rt, cookie);
1979         else
1980                 return rt6_check(rt, cookie);
1981 }
1982
1983 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1984 {
1985         struct rt6_info *rt = (struct rt6_info *) dst;
1986
1987         if (rt) {
1988                 if (rt->rt6i_flags & RTF_CACHE) {
1989                         if (rt6_check_expired(rt)) {
1990                                 ip6_del_rt(rt);
1991                                 dst = NULL;
1992                         }
1993                 } else {
1994                         dst_release(dst);
1995                         dst = NULL;
1996                 }
1997         }
1998         return dst;
1999 }
2000
2001 static void ip6_link_failure(struct sk_buff *skb)
2002 {
2003         struct rt6_info *rt;
2004
2005         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2006
2007         rt = (struct rt6_info *) skb_dst(skb);
2008         if (rt) {
2009                 if (rt->rt6i_flags & RTF_CACHE) {
2010                         if (dst_hold_safe(&rt->dst))
2011                                 ip6_del_rt(rt);
2012                 } else {
2013                         struct fib6_node *fn;
2014
2015                         rcu_read_lock();
2016                         fn = rcu_dereference(rt->rt6i_node);
2017                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2018                                 fn->fn_sernum = -1;
2019                         rcu_read_unlock();
2020                 }
2021         }
2022 }
2023
2024 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2025 {
2026         struct net *net = dev_net(rt->dst.dev);
2027
2028         rt->rt6i_flags |= RTF_MODIFIED;
2029         rt->rt6i_pmtu = mtu;
2030         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2031 }
2032
2033 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2034 {
2035         return !(rt->rt6i_flags & RTF_CACHE) &&
2036                 (rt->rt6i_flags & RTF_PCPU ||
2037                  rcu_access_pointer(rt->rt6i_node));
2038 }
2039
2040 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2041                                  const struct ipv6hdr *iph, u32 mtu)
2042 {
2043         const struct in6_addr *daddr, *saddr;
2044         struct rt6_info *rt6 = (struct rt6_info *)dst;
2045
2046         if (rt6->rt6i_flags & RTF_LOCAL)
2047                 return;
2048
2049         if (dst_metric_locked(dst, RTAX_MTU))
2050                 return;
2051
2052         if (iph) {
2053                 daddr = &iph->daddr;
2054                 saddr = &iph->saddr;
2055         } else if (sk) {
2056                 daddr = &sk->sk_v6_daddr;
2057                 saddr = &inet6_sk(sk)->saddr;
2058         } else {
2059                 daddr = NULL;
2060                 saddr = NULL;
2061         }
2062         dst_confirm_neigh(dst, daddr);
2063         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2064         if (mtu >= dst_mtu(dst))
2065                 return;
2066
2067         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2068                 rt6_do_update_pmtu(rt6, mtu);
2069                 /* update rt6_ex->stamp for cache */
2070                 if (rt6->rt6i_flags & RTF_CACHE)
2071                         rt6_update_exception_stamp_rt(rt6);
2072         } else if (daddr) {
2073                 struct rt6_info *nrt6;
2074
2075                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2076                 if (nrt6) {
2077                         rt6_do_update_pmtu(nrt6, mtu);
2078                         if (rt6_insert_exception(nrt6, rt6))
2079                                 dst_release_immediate(&nrt6->dst);
2080                 }
2081         }
2082 }
2083
2084 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2085                                struct sk_buff *skb, u32 mtu)
2086 {
2087         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2088 }
2089
2090 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2091                      int oif, u32 mark, kuid_t uid)
2092 {
2093         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2094         struct dst_entry *dst;
2095         struct flowi6 fl6;
2096
2097         memset(&fl6, 0, sizeof(fl6));
2098         fl6.flowi6_oif = oif;
2099         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2100         fl6.daddr = iph->daddr;
2101         fl6.saddr = iph->saddr;
2102         fl6.flowlabel = ip6_flowinfo(iph);
2103         fl6.flowi6_uid = uid;
2104
2105         dst = ip6_route_output(net, NULL, &fl6);
2106         if (!dst->error)
2107                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2108         dst_release(dst);
2109 }
2110 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2111
2112 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2113 {
2114         struct dst_entry *dst;
2115
2116         ip6_update_pmtu(skb, sock_net(sk), mtu,
2117                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2118
2119         dst = __sk_dst_get(sk);
2120         if (!dst || !dst->obsolete ||
2121             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2122                 return;
2123
2124         bh_lock_sock(sk);
2125         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2126                 ip6_datagram_dst_update(sk, false);
2127         bh_unlock_sock(sk);
2128 }
2129 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2130
/* Handle redirects */

/* Flow key used for redirect lookups: a flowi6 extended with the gateway
 * the redirect is being checked against.  fl6 must stay the first member,
 * because __ip6_route_redirect() casts the struct flowi6 pointer it is
 * given back to a struct ip6rd_flowi pointer.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2136
2137 static struct rt6_info *__ip6_route_redirect(struct net *net,
2138                                              struct fib6_table *table,
2139                                              struct flowi6 *fl6,
2140                                              int flags)
2141 {
2142         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2143         struct rt6_info *rt, *rt_cache;
2144         struct fib6_node *fn;
2145
2146         /* Get the "current" route for this destination and
2147          * check if the redirect has come from appropriate router.
2148          *
2149          * RFC 4861 specifies that redirects should only be
2150          * accepted if they come from the nexthop to the target.
2151          * Due to the way the routes are chosen, this notion
2152          * is a bit fuzzy and one might need to check all possible
2153          * routes.
2154          */
2155
2156         rcu_read_lock();
2157         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2158 restart:
2159         for_each_fib6_node_rt_rcu(fn) {
2160                 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2161                         continue;
2162                 if (rt6_check_expired(rt))
2163                         continue;
2164                 if (rt->dst.error)
2165                         break;
2166                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2167                         continue;
2168                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2169                         continue;
2170                 /* rt_cache's gateway might be different from its 'parent'
2171                  * in the case of an ip redirect.
2172                  * So we keep searching in the exception table if the gateway
2173                  * is different.
2174                  */
2175                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2176                         rt_cache = rt6_find_cached_rt(rt,
2177                                                       &fl6->daddr,
2178                                                       &fl6->saddr);
2179                         if (rt_cache &&
2180                             ipv6_addr_equal(&rdfl->gateway,
2181                                             &rt_cache->rt6i_gateway)) {
2182                                 rt = rt_cache;
2183                                 break;
2184                         }
2185                         continue;
2186                 }
2187                 break;
2188         }
2189
2190         if (!rt)
2191                 rt = net->ipv6.ip6_null_entry;
2192         else if (rt->dst.error) {
2193                 rt = net->ipv6.ip6_null_entry;
2194                 goto out;
2195         }
2196
2197         if (rt == net->ipv6.ip6_null_entry) {
2198                 fn = fib6_backtrack(fn, &fl6->saddr);
2199                 if (fn)
2200                         goto restart;
2201         }
2202
2203 out:
2204         ip6_hold_safe(net, &rt, true);
2205
2206         rcu_read_unlock();
2207
2208         trace_fib6_table_lookup(net, rt, table, fl6);
2209         return rt;
2210 };
2211
2212 static struct dst_entry *ip6_route_redirect(struct net *net,
2213                                         const struct flowi6 *fl6,
2214                                         const struct in6_addr *gateway)
2215 {
2216         int flags = RT6_LOOKUP_F_HAS_SADDR;
2217         struct ip6rd_flowi rdfl;
2218
2219         rdfl.fl6 = *fl6;
2220         rdfl.gateway = *gateway;
2221
2222         return fib6_rule_lookup(net, &rdfl.fl6,
2223                                 flags, __ip6_route_redirect);
2224 }
2225
2226 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2227                   kuid_t uid)
2228 {
2229         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2230         struct dst_entry *dst;
2231         struct flowi6 fl6;
2232
2233         memset(&fl6, 0, sizeof(fl6));
2234         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2235         fl6.flowi6_oif = oif;
2236         fl6.flowi6_mark = mark;
2237         fl6.daddr = iph->daddr;
2238         fl6.saddr = iph->saddr;
2239         fl6.flowlabel = ip6_flowinfo(iph);
2240         fl6.flowi6_uid = uid;
2241
2242         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2243         rt6_do_redirect(dst, NULL, skb);
2244         dst_release(dst);
2245 }
2246 EXPORT_SYMBOL_GPL(ip6_redirect);
2247
2248 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2249                             u32 mark)
2250 {
2251         const struct ipv6hdr *iph = ipv6_hdr(skb);
2252         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2253         struct dst_entry *dst;
2254         struct flowi6 fl6;
2255
2256         memset(&fl6, 0, sizeof(fl6));
2257         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2258         fl6.flowi6_oif = oif;
2259         fl6.flowi6_mark = mark;
2260         fl6.daddr = msg->dest;
2261         fl6.saddr = iph->daddr;
2262         fl6.flowi6_uid = sock_net_uid(net, NULL);
2263
2264         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2265         rt6_do_redirect(dst, NULL, skb);
2266         dst_release(dst);
2267 }
2268
2269 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2270 {
2271         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2272                      sk->sk_uid);
2273 }
2274 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2275
2276 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2277 {
2278         struct net_device *dev = dst->dev;
2279         unsigned int mtu = dst_mtu(dst);
2280         struct net *net = dev_net(dev);
2281
2282         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2283
2284         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2285                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2286
2287         /*
2288          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2289          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2290          * IPV6_MAXPLEN is also valid and means: "any MSS,
2291          * rely only on pmtu discovery"
2292          */
2293         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2294                 mtu = IPV6_MAXPLEN;
2295         return mtu;
2296 }
2297
2298 static unsigned int ip6_mtu(const struct dst_entry *dst)
2299 {
2300         const struct rt6_info *rt = (const struct rt6_info *)dst;
2301         unsigned int mtu = rt->rt6i_pmtu;
2302         struct inet6_dev *idev;
2303
2304         if (mtu)
2305                 goto out;
2306
2307         mtu = dst_metric_raw(dst, RTAX_MTU);
2308         if (mtu)
2309                 goto out;
2310
2311         mtu = IPV6_MIN_MTU;
2312
2313         rcu_read_lock();
2314         idev = __in6_dev_get(dst->dev);
2315         if (idev)
2316                 mtu = idev->cnf.mtu6;
2317         rcu_read_unlock();
2318
2319 out:
2320         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2321
2322         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2323 }
2324
2325 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2326                                   struct flowi6 *fl6)
2327 {
2328         struct dst_entry *dst;
2329         struct rt6_info *rt;
2330         struct inet6_dev *idev = in6_dev_get(dev);
2331         struct net *net = dev_net(dev);
2332
2333         if (unlikely(!idev))
2334                 return ERR_PTR(-ENODEV);
2335
2336         rt = ip6_dst_alloc(net, dev, 0);
2337         if (unlikely(!rt)) {
2338                 in6_dev_put(idev);
2339                 dst = ERR_PTR(-ENOMEM);
2340                 goto out;
2341         }
2342
2343         rt->dst.flags |= DST_HOST;
2344         rt->dst.input = ip6_input;
2345         rt->dst.output  = ip6_output;
2346         rt->rt6i_gateway  = fl6->daddr;
2347         rt->rt6i_dst.addr = fl6->daddr;
2348         rt->rt6i_dst.plen = 128;
2349         rt->rt6i_idev     = idev;
2350         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2351
2352         /* Add this dst into uncached_list so that rt6_disable_ip() can
2353          * do proper release of the net_device
2354          */
2355         rt6_uncached_list_add(rt);
2356         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2357
2358         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2359
2360 out:
2361         return dst;
2362 }
2363
2364 static int ip6_dst_gc(struct dst_ops *ops)
2365 {
2366         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2367         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2368         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2369         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2370         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2371         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2372         int entries;
2373
2374         entries = dst_entries_get_fast(ops);
2375         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2376             entries <= rt_max_size)
2377                 goto out;
2378
2379         net->ipv6.ip6_rt_gc_expire++;
2380         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2381         entries = dst_entries_get_slow(ops);
2382         if (entries < ops->gc_thresh)
2383                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2384 out:
2385         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2386         return entries > rt_max_size;
2387 }
2388
2389 static int ip6_convert_metrics(struct mx6_config *mxc,
2390                                const struct fib6_config *cfg)
2391 {
2392         struct net *net = cfg->fc_nlinfo.nl_net;
2393         bool ecn_ca = false;
2394         struct nlattr *nla;
2395         int remaining;
2396         u32 *mp;
2397
2398         if (!cfg->fc_mx)
2399                 return 0;
2400
2401         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2402         if (unlikely(!mp))
2403                 return -ENOMEM;
2404
2405         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2406                 int type = nla_type(nla);
2407                 u32 val;
2408
2409                 if (!type)
2410                         continue;
2411                 if (unlikely(type > RTAX_MAX))
2412                         goto err;
2413
2414                 if (type == RTAX_CC_ALGO) {
2415                         char tmp[TCP_CA_NAME_MAX];
2416
2417                         nla_strlcpy(tmp, nla, sizeof(tmp));
2418                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2419                         if (val == TCP_CA_UNSPEC)
2420                                 goto err;
2421                 } else {
2422                         val = nla_get_u32(nla);
2423                 }
2424                 if (type == RTAX_HOPLIMIT && val > 255)
2425                         val = 255;
2426                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2427                         goto err;
2428
2429                 mp[type - 1] = val;
2430                 __set_bit(type - 1, mxc->mx_valid);
2431         }
2432
2433         if (ecn_ca) {
2434                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2435                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2436         }
2437
2438         mxc->mx = mp;
2439         return 0;
2440  err:
2441         kfree(mp);
2442         return -EINVAL;
2443 }
2444
2445 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2446                                             struct fib6_config *cfg,
2447                                             const struct in6_addr *gw_addr,
2448                                             u32 tbid, int flags)
2449 {
2450         struct flowi6 fl6 = {
2451                 .flowi6_oif = cfg->fc_ifindex,
2452                 .daddr = *gw_addr,
2453                 .saddr = cfg->fc_prefsrc,
2454         };
2455         struct fib6_table *table;
2456         struct rt6_info *rt;
2457
2458         table = fib6_get_table(net, tbid);
2459         if (!table)
2460                 return NULL;
2461
2462         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2463                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2464
2465         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2466         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2467
2468         /* if table lookup failed, fall back to full lookup */
2469         if (rt == net->ipv6.ip6_null_entry) {
2470                 ip6_rt_put(rt);
2471                 rt = NULL;
2472         }
2473
2474         return rt;
2475 }
2476
2477 static int ip6_route_check_nh_onlink(struct net *net,
2478                                      struct fib6_config *cfg,
2479                                      struct net_device *dev,
2480                                      struct netlink_ext_ack *extack)
2481 {
2482         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
2483         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2484         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2485         struct rt6_info *grt;
2486         int err;
2487
2488         err = 0;
2489         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2490         if (grt) {
2491                 if (grt->rt6i_flags & flags || dev != grt->dst.dev) {
2492                         NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
2493                         err = -EINVAL;
2494                 }
2495
2496                 ip6_rt_put(grt);
2497         }
2498
2499         return err;
2500 }
2501
2502 static int ip6_route_check_nh(struct net *net,
2503                               struct fib6_config *cfg,
2504                               struct net_device **_dev,
2505                               struct inet6_dev **idev)
2506 {
2507         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2508         struct net_device *dev = _dev ? *_dev : NULL;
2509         struct rt6_info *grt = NULL;
2510         int err = -EHOSTUNREACH;
2511
2512         if (cfg->fc_table) {
2513                 int flags = RT6_LOOKUP_F_IFACE;
2514
2515                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2516                                           cfg->fc_table, flags);
2517                 if (grt) {
2518                         if (grt->rt6i_flags & RTF_GATEWAY ||
2519                             (dev && dev != grt->dst.dev)) {
2520                                 ip6_rt_put(grt);
2521                                 grt = NULL;
2522                         }
2523                 }
2524         }
2525
2526         if (!grt)
2527                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
2528
2529         if (!grt)
2530                 goto out;
2531
2532         if (dev) {
2533                 if (dev != grt->dst.dev) {
2534                         ip6_rt_put(grt);
2535                         goto out;
2536                 }
2537         } else {
2538                 *_dev = dev = grt->dst.dev;
2539                 *idev = grt->rt6i_idev;
2540                 dev_hold(dev);
2541                 in6_dev_hold(grt->rt6i_idev);
2542         }
2543
2544         if (!(grt->rt6i_flags & RTF_GATEWAY))
2545                 err = 0;
2546
2547         ip6_rt_put(grt);
2548
2549 out:
2550         return err;
2551 }
2552
/*
 * ip6_route_info_create - validate a route configuration and build the
 * matching rt6_info, without inserting it into a FIB table.
 *
 * @cfg:    route description.  It is modified in place: fc_metric is
 *          defaulted to IP6_RT_PRIO_USER when 0, fc_protocol is defaulted
 *          to RTPROT_BOOT when RTPROT_UNSPEC, and on success
 *          fc_nlinfo.nl_net is overwritten with dev_net() of the egress
 *          device.
 * @extack: extended ack; most (not all) validation failures store a
 *          message in it via NL_SET_ERR_MSG().
 *
 * On success returns the new route; the device and inet6_dev references
 * acquired here are stored in rt->dst.dev and rt->rt6i_idev.
 * On failure returns ERR_PTR(-errno) after dropping any device/inet6_dev
 * reference held and releasing the partially built route.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
                                              struct netlink_ext_ack *extack)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;
        int err = -EINVAL;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
                goto out;
        }

        /* RTF_CACHE is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_CACHE) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
                goto out;
        }

        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
        }
        if (cfg->fc_src_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid source address length");
                goto out;
        }
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len) {
                NL_SET_ERR_MSG(extack,
                               "Specifying source address requires IPV6_SUBTREES to be enabled");
                goto out;
        }
#endif
        /* An explicit egress interface was given: resolve it and its
         * inet6_dev.  Both are released at 'out' if anything below fails.
         */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        /* An onlink nexthop needs an explicit device, and it must be up. */
        if (cfg->fc_flags & RTNH_F_ONLINK) {
                if (!dev) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop device required for onlink");
                        err = -ENODEV;
                        goto out;
                }

                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        err = -ENETDOWN;
                        goto out;
                }
        }

        /* -ENOBUFS is what a failed table lookup/creation reports. */
        err = -ENOBUFS;
        /* A netlink request without NLM_F_CREATE is expected to hit an
         * existing table; a missing one is still created, with a warning.
         */
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        /* Only RTF_ADDRCONF routes are allocated without DST_NOCOUNT. */
        rt = ip6_dst_alloc(net, NULL,
                           (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

        if (!rt) {
                err = -ENOMEM;
                goto out;
        }

        if (cfg->fc_flags & RTF_EXPIRES)
                rt6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                rt6_clean_expires(rt);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Pick the input handler from the destination type / RTF_LOCAL. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        /* Lightweight tunnel encap: build its state and, where the encap
         * asks for it, interpose the lwtunnel handlers while remembering
         * the original ones.
         */
        if (cfg->fc_encap) {
                struct lwtunnel_state *lwtstate;

                err = lwtunnel_build_state(cfg->fc_encap_type,
                                           cfg->fc_encap, AF_INET6, cfg,
                                           &lwtstate, extack);
                if (err)
                        goto out;
                rt->dst.lwtstate = lwtstate_get(lwtstate);
                if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
                        rt->dst.lwtstate->orig_output = rt->dst.output;
                        rt->dst.output = lwtunnel_output;
                }
                if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
                        rt->dst.lwtstate->orig_input = rt->dst.input;
                        rt->dst.input = lwtunnel_input;
                }
        }

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
                rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;
        rt->rt6i_nh_weight = 1;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) &&
             !(addr_type & IPV6_ADDR_LOOPBACK) &&
             !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                /* The route type selects the error reported and the
                 * handlers used to dispose of packets.
                 */
                switch (cfg->fc_type) {
                case RTN_BLACKHOLE:
                        rt->dst.error = -EINVAL;
                        rt->dst.output = dst_discard_out;
                        rt->dst.input = dst_discard;
                        break;
                case RTN_PROHIBIT:
                        rt->dst.error = -EACCES;
                        rt->dst.output = ip6_pkt_prohibit_out;
                        rt->dst.input = ip6_pkt_prohibit;
                        break;
                case RTN_THROW:
                case RTN_UNREACHABLE:
                default:
                        rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
                                        : (cfg->fc_type == RTN_UNREACHABLE)
                                        ? -EHOSTUNREACH : -ENETUNREACH;
                        rt->dst.output = ip6_pkt_discard_out;
                        rt->dst.input = ip6_pkt_discard;
                        break;
                }
                /* Reject routes skip gateway/prefsrc validation entirely. */
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                gwa_type = ipv6_addr_type(gw_addr);

                /* if gw_addr is local we will fail to detect this in case
                 * address is still TENTATIVE (DAD in progress). rt6_lookup()
                 * will return already-added prefix route via interface that
                 * prefix route was assigned to, which might be non-loopback.
                 */
                err = -EINVAL;
                if (ipv6_chk_addr_and_flags(net, gw_addr,
                                            gwa_type & IPV6_ADDR_LINKLOCAL ?
                                            dev : NULL, 0, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid gateway address");
                        goto out;
                }
                rt->rt6i_gateway = *gw_addr;

                /* Link-local unicast gateways need no further lookup;
                 * anything else is validated below.
                 */
                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                           We allow IPv4-mapped nexthops to support RFC4798-type
                           addressing
                         */
                        if (!(gwa_type & (IPV6_ADDR_UNICAST |
                                          IPV6_ADDR_MAPPED))) {
                                NL_SET_ERR_MSG(extack,
                                               "Invalid gateway address");
                                goto out;
                        }

                        /* The non-onlink check may fill in dev/idev (with
                         * references held) when no device was specified.
                         */
                        if (cfg->fc_flags & RTNH_F_ONLINK) {
                                err = ip6_route_check_nh_onlink(net, cfg, dev,
                                                                extack);
                        } else {
                                err = ip6_route_check_nh(net, cfg, &dev, &idev);
                        }
                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (!dev) {
                        NL_SET_ERR_MSG(extack, "Egress device not specified");
                        goto out;
                } else if (dev->flags & IFF_LOOPBACK) {
                        NL_SET_ERR_MSG(extack,
                                       "Egress device can not be loopback device for this route");
                        goto out;
                }
        }

        /* Every non-reject route needs an egress device that is up. */
        err = -ENODEV;
        if (!dev)
                goto out;

        if (!(dev->flags & IFF_UP)) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                err = -ENETDOWN;
                goto out;
        }

        /* A preferred source address, if given, must pass ipv6_chk_addr()
         * for the egress device.
         */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid source address");
                        err = -EINVAL;
                        goto out;
                }
                rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Flag the nexthop as link-down when the device has no carrier,
         * except for local/anycast routes.
         */
        if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
            !netif_carrier_ok(dev))
                rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
        rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
        /* The dev/idev references held above now belong to the route. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return rt;
out:
        /* Undo whatever was acquired before the failure. */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_release_immediate(&rt->dst);

        return ERR_PTR(err);
}
2844
2845 int ip6_route_add(struct fib6_config *cfg,
2846                   struct netlink_ext_ack *extack)
2847 {
2848         struct mx6_config mxc = { .mx = NULL, };
2849         struct rt6_info *rt;
2850         int err;
2851
2852         rt = ip6_route_info_create(cfg, extack);
2853         if (IS_ERR(rt)) {
2854                 err = PTR_ERR(rt);
2855                 rt = NULL;
2856                 goto out;
2857         }
2858
2859         err = ip6_convert_metrics(&mxc, cfg);
2860         if (err)
2861                 goto out;
2862
2863         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2864
2865         kfree(mxc.mx);
2866
2867         return err;
2868 out:
2869         if (rt)
2870                 dst_release_immediate(&rt->dst);
2871
2872         return err;
2873 }
2874
2875 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2876 {
2877         int err;
2878         struct fib6_table *table;
2879         struct net *net = dev_net(rt->dst.dev);
2880
2881         if (rt == net->ipv6.ip6_null_entry) {
2882                 err = -ENOENT;
2883                 goto out;
2884         }
2885
2886         table = rt->rt6i_table;
2887         spin_lock_bh(&table->tb6_lock);
2888         err = fib6_del(rt, info);
2889         spin_unlock_bh(&table->tb6_lock);
2890
2891 out:
2892         ip6_rt_put(rt);
2893         return err;
2894 }
2895
2896 int ip6_del_rt(struct rt6_info *rt)
2897 {
2898         struct nl_info info = {
2899                 .nl_net = dev_net(rt->dst.dev),
2900         };
2901         return __ip6_del_rt(rt, &info);
2902 }
2903
2904 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2905 {
2906         struct nl_info *info = &cfg->fc_nlinfo;
2907         struct net *net = info->nl_net;
2908         struct sk_buff *skb = NULL;
2909         struct fib6_table *table;
2910         int err = -ENOENT;
2911
2912         if (rt == net->ipv6.ip6_null_entry)
2913                 goto out_put;
2914         table = rt->rt6i_table;
2915         spin_lock_bh(&table->tb6_lock);
2916
2917         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2918                 struct rt6_info *sibling, *next_sibling;
2919
2920                 /* prefer to send a single notification with all hops */
2921                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2922                 if (skb) {
2923                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2924
2925                         if (rt6_fill_node(net, skb, rt,
2926                                           NULL, NULL, 0, RTM_DELROUTE,
2927                                           info->portid, seq, 0) < 0) {
2928                                 kfree_skb(skb);
2929                                 skb = NULL;
2930                         } else
2931                                 info->skip_notify = 1;
2932                 }
2933
2934                 list_for_each_entry_safe(sibling, next_sibling,
2935                                          &rt->rt6i_siblings,
2936                                          rt6i_siblings) {
2937                         err = fib6_del(sibling, info);
2938                         if (err)
2939                                 goto out_unlock;
2940                 }
2941         }
2942
2943         err = fib6_del(rt, info);
2944 out_unlock:
2945         spin_unlock_bh(&table->tb6_lock);
2946 out_put:
2947         ip6_rt_put(rt);
2948
2949         if (skb) {
2950                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2951                             info->nlh, gfp_any());
2952         }
2953         return err;
2954 }
2955
2956 static int ip6_route_del(struct fib6_config *cfg,
2957                          struct netlink_ext_ack *extack)
2958 {
2959         struct rt6_info *rt, *rt_cache;
2960         struct fib6_table *table;
2961         struct fib6_node *fn;
2962         int err = -ESRCH;
2963
2964         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2965         if (!table) {
2966                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2967                 return err;
2968         }
2969
2970         rcu_read_lock();
2971
2972         fn = fib6_locate(&table->tb6_root,
2973                          &cfg->fc_dst, cfg->fc_dst_len,
2974                          &cfg->fc_src, cfg->fc_src_len,
2975                          !(cfg->fc_flags & RTF_CACHE));
2976
2977         if (fn) {
2978                 for_each_fib6_node_rt_rcu(fn) {
2979                         if (cfg->fc_flags & RTF_CACHE) {
2980                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2981                                                               &cfg->fc_src);
2982                                 if (!rt_cache)
2983                                         continue;
2984                                 rt = rt_cache;
2985                         }
2986                         if (cfg->fc_ifindex &&
2987                             (!rt->dst.dev ||
2988                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2989                                 continue;
2990                         if (cfg->fc_flags & RTF_GATEWAY &&
2991                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2992                                 continue;
2993                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2994                                 continue;
2995                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2996                                 continue;
2997                         if (!dst_hold_safe(&rt->dst))
2998                                 break;
2999                         rcu_read_unlock();
3000
3001                         /* if gateway was specified only delete the one hop */
3002                         if (cfg->fc_flags & RTF_GATEWAY)
3003                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3004
3005                         return __ip6_del_rt_siblings(rt, cfg);
3006                 }
3007         }
3008         rcu_read_unlock();
3009
3010         return err;
3011 }
3012
/*
 * Process a received ICMPv6 Redirect for the route @dst.
 *
 * The message in @skb is validated, the neighbour entry for the redirect
 * target is created/updated, and a cached clone of the route pointing at
 * the target is inserted as an exception of the original route.  A
 * NETEVENT_REDIRECT notification is raised on success.  Invalid messages
 * are dropped silently apart from a rate-limited debug message.
 *
 * @sk is not used by this function.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct netevent_redirect netevent;
        struct rt6_info *rt, *nrt = NULL;
        struct ndisc_options ndopts;
        struct inet6_dev *in6_dev;
        struct neighbour *neigh;
        struct rd_msg *msg;
        int optlen, on_link;
        u8 *lladdr;

        /* Bytes available for ND options after the fixed redirect header. */
        optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
        optlen -= sizeof(*msg);

        if (optlen < 0) {
                net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
                return;
        }

        msg = (struct rd_msg *)icmp6_hdr(skb);

        if (ipv6_addr_is_multicast(&msg->dest)) {
                net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
                return;
        }

        /* target == destination means the destination is on-link;
         * otherwise the target must be a link-local unicast address.
         */
        on_link = 0;
        if (ipv6_addr_equal(&msg->dest, &msg->target)) {
                on_link = 1;
        } else if (ipv6_addr_type(&msg->target) !=
                   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
                return;
        }

        /* Redirects are ignored when forwarding is enabled on the
         * receiving interface or accept_redirects is off.
         */
        in6_dev = __in6_dev_get(skb->dev);
        if (!in6_dev)
                return;
        if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
                return;

        /* RFC2461 8.1:
         *      The IP source address of the Redirect MUST be the same as the current
         *      first-hop router for the specified ICMP Destination Address.
         */

        if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
                net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
                return;
        }

        /* The target link-layer address option is optional, but if it is
         * present it must be well formed.
         */
        lladdr = NULL;
        if (ndopts.nd_opts_tgt_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
                                             skb->dev);
                if (!lladdr) {
                        net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
                        return;
                }
        }

        rt = (struct rt6_info *) dst;
        if (rt->rt6i_flags & RTF_REJECT) {
                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
                return;
        }

        /* Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

        /* Look up the neighbour entry for the target, creating it if
         * needed; the reference is dropped at 'out'.
         */
        neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
        if (!neigh)
                return;

        /*
         *      We have finally decided to accept it.
         */

        /* The router flags are only set when the target is a router
         * rather than the (on-link) destination itself.
         */
        ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER)),
                     NDISC_REDIRECT, &ndopts);

        nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
        if (!nrt)
                goto out;

        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        nrt->rt6i_protocol = RTPROT_REDIRECT;
        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

        /* No need to remove rt from the exception table if rt is
         * a cached route because rt6_insert_exception() will
         * takes care of it
         */
        if (rt6_insert_exception(nrt, rt)) {
                dst_release_immediate(&nrt->dst);
                goto out;
        }

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        netevent.daddr = &msg->dest;
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
        neigh_release(neigh);
}
3130
3131 /*
3132  *      Misc support functions
3133  */
3134
3135 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3136 {
3137         BUG_ON(from->from);
3138
3139         rt->rt6i_flags &= ~RTF_EXPIRES;
3140         dst_hold(&from->dst);
3141         rt->from = from;
3142         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3143 }
3144
3145 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3146 {
3147         rt->dst.input = ort->dst.input;
3148         rt->dst.output = ort->dst.output;
3149         rt->rt6i_dst = ort->rt6i_dst;
3150         rt->dst.error = ort->dst.error;
3151         rt->rt6i_idev = ort->rt6i_idev;
3152         if (rt->rt6i_idev)
3153                 in6_dev_hold(rt->rt6i_idev);
3154         rt->dst.lastuse = jiffies;
3155         rt->rt6i_gateway = ort->rt6i_gateway;
3156         rt->rt6i_flags = ort->rt6i_flags;
3157         rt6_set_from(rt, ort);
3158         rt->rt6i_metric = ort->rt6i_metric;
3159 #ifdef CONFIG_IPV6_SUBTREES
3160         rt->rt6i_src = ort->rt6i_src;
3161 #endif
3162         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3163         rt->rt6i_table = ort->rt6i_table;
3164         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3165 }
3166
3167 #ifdef CONFIG_IPV6_ROUTE_INFO
3168 static struct rt6_info *rt6_get_route_info(struct net *net,
3169                                            const struct in6_addr *prefix, int prefixlen,
3170                                            const struct in6_addr *gwaddr,
3171                                            struct net_device *dev)
3172 {
3173         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3174         int ifindex = dev->ifindex;
3175         struct fib6_node *fn;
3176         struct rt6_info *rt = NULL;
3177         struct fib6_table *table;
3178
3179         table = fib6_get_table(net, tb_id);
3180         if (!table)
3181                 return NULL;
3182
3183         rcu_read_lock();
3184         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3185         if (!fn)
3186                 goto out;
3187
3188         for_each_fib6_node_rt_rcu(fn) {
3189                 if (rt->dst.dev->ifindex != ifindex)
3190                         continue;
3191                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3192                         continue;
3193                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3194                         continue;
3195                 ip6_hold_safe(NULL, &rt, false);
3196                 break;
3197         }
3198 out:
3199         rcu_read_unlock();
3200         return rt;
3201 }
3202
3203 static struct rt6_info *rt6_add_route_info(struct net *net,
3204                                            const struct in6_addr *prefix, int prefixlen,
3205                                            const struct in6_addr *gwaddr,
3206                                            struct net_device *dev,
3207                                            unsigned int pref)
3208 {
3209         struct fib6_config cfg = {
3210                 .fc_metric      = IP6_RT_PRIO_USER,
3211                 .fc_ifindex     = dev->ifindex,
3212                 .fc_dst_len     = prefixlen,
3213                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3214                                   RTF_UP | RTF_PREF(pref),
3215                 .fc_protocol = RTPROT_RA,
3216                 .fc_nlinfo.portid = 0,
3217                 .fc_nlinfo.nlh = NULL,
3218                 .fc_nlinfo.nl_net = net,
3219         };
3220
3221         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3222         cfg.fc_dst = *prefix;
3223         cfg.fc_gateway = *gwaddr;
3224
3225         /* We should treat it as a default route if prefix length is 0. */
3226         if (!prefixlen)
3227                 cfg.fc_flags |= RTF_DEFAULT;
3228
3229         ip6_route_add(&cfg, NULL);
3230
3231         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3232 }
3233 #endif
3234
3235 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3236 {
3237         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3238         struct rt6_info *rt;
3239         struct fib6_table *table;
3240
3241         table = fib6_get_table(dev_net(dev), tb_id);
3242         if (!table)
3243                 return NULL;
3244
3245         rcu_read_lock();
3246         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3247                 if (dev == rt->dst.dev &&
3248                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3249                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3250                         break;
3251         }
3252         if (rt)
3253                 ip6_hold_safe(NULL, &rt, false);
3254         rcu_read_unlock();
3255         return rt;
3256 }
3257
3258 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3259                                      struct net_device *dev,
3260                                      unsigned int pref)
3261 {
3262         struct fib6_config cfg = {
3263                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3264                 .fc_metric      = IP6_RT_PRIO_USER,
3265                 .fc_ifindex     = dev->ifindex,
3266                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3267                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3268                 .fc_protocol = RTPROT_RA,
3269                 .fc_nlinfo.portid = 0,
3270                 .fc_nlinfo.nlh = NULL,
3271                 .fc_nlinfo.nl_net = dev_net(dev),
3272         };
3273
3274         cfg.fc_gateway = *gwaddr;
3275
3276         if (!ip6_route_add(&cfg, NULL)) {
3277                 struct fib6_table *table;
3278
3279                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3280                 if (table)
3281                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3282         }
3283
3284         return rt6_get_dflt_router(gwaddr, dev);
3285 }
3286
3287 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3288 {
3289         struct rt6_info *rt;
3290
3291 restart:
3292         rcu_read_lock();
3293         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3294                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3295                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3296                         if (dst_hold_safe(&rt->dst)) {
3297                                 rcu_read_unlock();
3298                                 ip6_del_rt(rt);
3299                         } else {
3300                                 rcu_read_unlock();
3301                         }
3302                         goto restart;
3303                 }
3304         }
3305         rcu_read_unlock();
3306
3307         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3308 }
3309
3310 void rt6_purge_dflt_routers(struct net *net)
3311 {
3312         struct fib6_table *table;
3313         struct hlist_head *head;
3314         unsigned int h;
3315
3316         rcu_read_lock();
3317
3318         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3319                 head = &net->ipv6.fib_table_hash[h];
3320                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3321                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3322                                 __rt6_purge_dflt_routers(table);
3323                 }
3324         }
3325
3326         rcu_read_unlock();
3327 }
3328
3329 static void rtmsg_to_fib6_config(struct net *net,
3330                                  struct in6_rtmsg *rtmsg,
3331                                  struct fib6_config *cfg)
3332 {
3333         memset(cfg, 0, sizeof(*cfg));
3334
3335         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3336                          : RT6_TABLE_MAIN;
3337         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3338         cfg->fc_metric = rtmsg->rtmsg_metric;
3339         cfg->fc_expires = rtmsg->rtmsg_info;
3340         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3341         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3342         cfg->fc_flags = rtmsg->rtmsg_flags;
3343
3344         cfg->fc_nlinfo.nl_net = net;
3345
3346         cfg->fc_dst = rtmsg->rtmsg_dst;
3347         cfg->fc_src = rtmsg->rtmsg_src;
3348         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3349 }
3350
3351 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3352 {
3353         struct fib6_config cfg;
3354         struct in6_rtmsg rtmsg;
3355         int err;
3356
3357         switch (cmd) {
3358         case SIOCADDRT:         /* Add a route */
3359         case SIOCDELRT:         /* Delete a route */
3360                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3361                         return -EPERM;
3362                 err = copy_from_user(&rtmsg, arg,
3363                                      sizeof(struct in6_rtmsg));
3364                 if (err)
3365                         return -EFAULT;
3366
3367                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3368
3369                 rtnl_lock();
3370                 switch (cmd) {
3371                 case SIOCADDRT:
3372                         err = ip6_route_add(&cfg, NULL);
3373                         break;
3374                 case SIOCDELRT:
3375                         err = ip6_route_del(&cfg, NULL);
3376                         break;
3377                 default:
3378                         err = -EINVAL;
3379                 }
3380                 rtnl_unlock();
3381
3382                 return err;
3383         }
3384
3385         return -EINVAL;
3386 }
3387
3388 /*
3389  *      Drop the packet on the floor
3390  */
3391
3392 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3393 {
3394         int type;
3395         struct dst_entry *dst = skb_dst(skb);
3396         switch (ipstats_mib_noroutes) {
3397         case IPSTATS_MIB_INNOROUTES:
3398                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3399                 if (type == IPV6_ADDR_ANY) {
3400                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3401                                       IPSTATS_MIB_INADDRERRORS);
3402                         break;
3403                 }
3404                 /* FALLTHROUGH */
3405         case IPSTATS_MIB_OUTNOROUTES:
3406                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3407                               ipstats_mib_noroutes);
3408                 break;
3409         }
3410         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3411         kfree_skb(skb);
3412         return 0;
3413 }
3414
3415 static int ip6_pkt_discard(struct sk_buff *skb)
3416 {
3417         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3418 }
3419
3420 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3421 {
3422         skb->dev = skb_dst(skb)->dev;
3423         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3424 }
3425
3426 static int ip6_pkt_prohibit(struct sk_buff *skb)
3427 {
3428         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3429 }
3430
3431 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3432 {
3433         skb->dev = skb_dst(skb)->dev;
3434         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3435 }
3436
3437 /*
3438  *      Allocate a dst for local (unicast / anycast) address.
3439  */
3440
3441 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3442                                     const struct in6_addr *addr,
3443                                     bool anycast)
3444 {
3445         u32 tb_id;
3446         struct net *net = dev_net(idev->dev);
3447         struct net_device *dev = idev->dev;
3448         struct rt6_info *rt;
3449
3450         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3451         if (!rt)
3452                 return ERR_PTR(-ENOMEM);
3453
3454         in6_dev_hold(idev);
3455
3456         rt->dst.flags |= DST_HOST;
3457         rt->dst.input = ip6_input;
3458         rt->dst.output = ip6_output;
3459         rt->rt6i_idev = idev;
3460
3461         rt->rt6i_protocol = RTPROT_KERNEL;
3462         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3463         if (anycast)
3464                 rt->rt6i_flags |= RTF_ANYCAST;
3465         else
3466                 rt->rt6i_flags |= RTF_LOCAL;
3467
3468         rt->rt6i_gateway  = *addr;
3469         rt->rt6i_dst.addr = *addr;
3470         rt->rt6i_dst.plen = 128;
3471         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3472         rt->rt6i_table = fib6_get_table(net, tb_id);
3473
3474         return rt;
3475 }
3476
/* remove deleted ip from prefsrc entries */

/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* only routes on this device; NULL matches any */
	struct net *net;		/* namespace; used to skip its ip6_null_entry */
	struct in6_addr *addr;		/* address compared against rt6i_prefsrc.addr */
};
3483
3484 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3485 {
3486         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3487         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3488         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3489
3490         if (((void *)rt->dst.dev == dev || !dev) &&
3491             rt != net->ipv6.ip6_null_entry &&
3492             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3493                 spin_lock_bh(&rt6_exception_lock);
3494                 /* remove prefsrc entry */
3495                 rt->rt6i_prefsrc.plen = 0;
3496                 /* need to update cache as well */
3497                 rt6_exceptions_remove_prefsrc(rt);
3498                 spin_unlock_bh(&rt6_exception_lock);
3499         }
3500         return 0;
3501 }
3502
3503 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3504 {
3505         struct net *net = dev_net(ifp->idev->dev);
3506         struct arg_dev_net_ip adni = {
3507                 .dev = ifp->idev->dev,
3508                 .net = net,
3509                 .addr = &ifp->addr,
3510         };
3511         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3512 }
3513
3514 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3515
3516 /* Remove routers and update dst entries when gateway turn into host. */
3517 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3518 {
3519         struct in6_addr *gateway = (struct in6_addr *)arg;
3520
3521         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3522             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3523                 return -1;
3524         }
3525
3526         /* Further clean up cached routes in exception table.
3527          * This is needed because cached route may have a different
3528          * gateway than its 'parent' in the case of an ip redirect.
3529          */
3530         rt6_exceptions_clean_tohost(rt, gateway);
3531
3532         return 0;
3533 }
3534
3535 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3536 {
3537         fib6_clean_all(net, fib6_clean_tohost, gateway);
3538 }
3539
/*
 * Argument bundle for the fib6_ifup() / fib6_ifdown() route walkers.
 * Only one union member is meaningful for a given walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event code */
	};
};
3547
3548 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3549 {
3550         struct rt6_info *iter;
3551         struct fib6_node *fn;
3552
3553         fn = rcu_dereference_protected(rt->rt6i_node,
3554                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3555         iter = rcu_dereference_protected(fn->leaf,
3556                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3557         while (iter) {
3558                 if (iter->rt6i_metric == rt->rt6i_metric &&
3559                     rt6_qualify_for_ecmp(iter))
3560                         return iter;
3561                 iter = rcu_dereference_protected(iter->rt6_next,
3562                                 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3563         }
3564
3565         return NULL;
3566 }
3567
3568 static bool rt6_is_dead(const struct rt6_info *rt)
3569 {
3570         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3571             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3572              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3573                 return true;
3574
3575         return false;
3576 }
3577
3578 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3579 {
3580         struct rt6_info *iter;
3581         int total = 0;
3582
3583         if (!rt6_is_dead(rt))
3584                 total += rt->rt6i_nh_weight;
3585
3586         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3587                 if (!rt6_is_dead(iter))
3588                         total += iter->rt6i_nh_weight;
3589         }
3590
3591         return total;
3592 }
3593
3594 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3595 {
3596         int upper_bound = -1;
3597
3598         if (!rt6_is_dead(rt)) {
3599                 *weight += rt->rt6i_nh_weight;
3600                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3601                                                     total) - 1;
3602         }
3603         atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3604 }
3605
3606 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3607 {
3608         struct rt6_info *iter;
3609         int weight = 0;
3610
3611         rt6_upper_bound_set(rt, &weight, total);
3612
3613         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3614                 rt6_upper_bound_set(iter, &weight, total);
3615 }
3616
3617 void rt6_multipath_rebalance(struct rt6_info *rt)
3618 {
3619         struct rt6_info *first;
3620         int total;
3621
3622         /* In case the entire multipath route was marked for flushing,
3623          * then there is no need to rebalance upon the removal of every
3624          * sibling route.
3625          */
3626         if (!rt->rt6i_nsiblings || rt->should_flush)
3627                 return;
3628
3629         /* During lookup routes are evaluated in order, so we need to
3630          * make sure upper bounds are assigned from the first sibling
3631          * onwards.
3632          */
3633         first = rt6_multipath_first_sibling(rt);
3634         if (WARN_ON_ONCE(!first))
3635                 return;
3636
3637         total = rt6_multipath_total_weight(first);
3638         rt6_multipath_upper_bound_set(first, total);
3639 }
3640
3641 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3642 {
3643         const struct arg_netdev_event *arg = p_arg;
3644         const struct net *net = dev_net(arg->dev);
3645
3646         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3647                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3648                 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3649                 rt6_multipath_rebalance(rt);
3650         }
3651
3652         return 0;
3653 }
3654
3655 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3656 {
3657         struct arg_netdev_event arg = {
3658                 .dev = dev,
3659                 {
3660                         .nh_flags = nh_flags,
3661                 },
3662         };
3663
3664         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3665                 arg.nh_flags |= RTNH_F_LINKDOWN;
3666
3667         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3668 }
3669
3670 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3671                                    const struct net_device *dev)
3672 {
3673         struct rt6_info *iter;
3674
3675         if (rt->dst.dev == dev)
3676                 return true;
3677         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3678                 if (iter->dst.dev == dev)
3679                         return true;
3680
3681         return false;
3682 }
3683
3684 static void rt6_multipath_flush(struct rt6_info *rt)
3685 {
3686         struct rt6_info *iter;
3687
3688         rt->should_flush = 1;
3689         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3690                 iter->should_flush = 1;
3691 }
3692
3693 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3694                                              const struct net_device *down_dev)
3695 {
3696         struct rt6_info *iter;
3697         unsigned int dead = 0;
3698
3699         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3700                 dead++;
3701         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3702                 if (iter->dst.dev == down_dev ||
3703                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3704                         dead++;
3705
3706         return dead;
3707 }
3708
3709 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3710                                        const struct net_device *dev,
3711                                        unsigned int nh_flags)
3712 {
3713         struct rt6_info *iter;
3714
3715         if (rt->dst.dev == dev)
3716                 rt->rt6i_nh_flags |= nh_flags;
3717         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3718                 if (iter->dst.dev == dev)
3719                         iter->rt6i_nh_flags |= nh_flags;
3720 }
3721
3722 /* called with write lock held for table with rt */
3723 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3724 {
3725         const struct arg_netdev_event *arg = p_arg;
3726         const struct net_device *dev = arg->dev;
3727         const struct net *net = dev_net(dev);
3728
3729         if (rt == net->ipv6.ip6_null_entry)
3730                 return 0;
3731
3732         switch (arg->event) {
3733         case NETDEV_UNREGISTER:
3734                 return rt->dst.dev == dev ? -1 : 0;
3735         case NETDEV_DOWN:
3736                 if (rt->should_flush)
3737                         return -1;
3738                 if (!rt->rt6i_nsiblings)
3739                         return rt->dst.dev == dev ? -1 : 0;
3740                 if (rt6_multipath_uses_dev(rt, dev)) {
3741                         unsigned int count;
3742
3743                         count = rt6_multipath_dead_count(rt, dev);
3744                         if (rt->rt6i_nsiblings + 1 == count) {
3745                                 rt6_multipath_flush(rt);
3746                                 return -1;
3747                         }
3748                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3749                                                    RTNH_F_LINKDOWN);
3750                         fib6_update_sernum(rt);
3751                         rt6_multipath_rebalance(rt);
3752                 }
3753                 return -2;
3754         case NETDEV_CHANGE:
3755                 if (rt->dst.dev != dev ||
3756                     rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3757                         break;
3758                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3759                 rt6_multipath_rebalance(rt);
3760                 break;
3761         }
3762
3763         return 0;
3764 }
3765
3766 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3767 {
3768         struct arg_netdev_event arg = {
3769                 .dev = dev,
3770                 {
3771                         .event = event,
3772                 },
3773         };
3774
3775         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3776 }
3777
3778 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3779 {
3780         rt6_sync_down_dev(dev, event);
3781         rt6_uncached_list_flush_dev(dev_net(dev), dev);
3782         neigh_ifdown(&nd_tbl, dev);
3783 }
3784
/* Argument bundle handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
3789
3790 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3791 {
3792         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3793         struct inet6_dev *idev;
3794
3795         /* In IPv6 pmtu discovery is not optional,
3796            so that RTAX_MTU lock cannot disable it.
3797            We still use this lock to block changes
3798            caused by addrconf/ndisc.
3799         */
3800
3801         idev = __in6_dev_get(arg->dev);
3802         if (!idev)
3803                 return 0;
3804
3805         /* For administrative MTU increase, there is no way to discover
3806            IPv6 PMTU increase, so PMTU increase should be updated here.
3807            Since RFC 1981 doesn't include administrative MTU increase
3808            update PMTU increase is a MUST. (i.e. jumbo frame)
3809          */
3810         /*
3811            If new MTU is less than route PMTU, this new MTU will be the
3812            lowest MTU in the path, update the route PMTU to reflect PMTU
3813            decreases; if new MTU is greater than route PMTU, and the
3814            old MTU is the lowest MTU in the path, update the route PMTU
3815            to reflect the increase. In this case if the other nodes' MTU
3816            also have the lowest MTU, TOO BIG MESSAGE will be lead to
3817            PMTU discovery.
3818          */
3819         if (rt->dst.dev == arg->dev &&
3820             dst_metric_raw(&rt->dst, RTAX_MTU) &&
3821             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3822                 spin_lock_bh(&rt6_exception_lock);
3823                 if (dst_mtu(&rt->dst) >= arg->mtu ||
3824                     (dst_mtu(&rt->dst) < arg->mtu &&
3825                      dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3826                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3827                 }
3828                 rt6_exceptions_update_pmtu(rt, arg->mtu);
3829                 spin_unlock_bh(&rt6_exception_lock);
3830         }
3831         return 0;
3832 }
3833
3834 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3835 {
3836         struct rt6_mtu_change_arg arg = {
3837                 .dev = dev,
3838                 .mtu = mtu,
3839         };
3840
3841         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3842 }
3843
3844 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3845         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3846         [RTA_OIF]               = { .type = NLA_U32 },
3847         [RTA_IIF]               = { .type = NLA_U32 },
3848         [RTA_PRIORITY]          = { .type = NLA_U32 },
3849         [RTA_METRICS]           = { .type = NLA_NESTED },
3850         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3851         [RTA_PREF]              = { .type = NLA_U8 },
3852         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3853         [RTA_ENCAP]             = { .type = NLA_NESTED },
3854         [RTA_EXPIRES]           = { .type = NLA_U32 },
3855         [RTA_UID]               = { .type = NLA_U32 },
3856         [RTA_MARK]              = { .type = NLA_U32 },
3857 };
3858
3859 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3860                               struct fib6_config *cfg,
3861                               struct netlink_ext_ack *extack)
3862 {
3863         struct rtmsg *rtm;
3864         struct nlattr *tb[RTA_MAX+1];
3865         unsigned int pref;
3866         int err;
3867
3868         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3869                           NULL);
3870         if (err < 0)
3871                 goto errout;
3872
3873         err = -EINVAL;
3874         rtm = nlmsg_data(nlh);
3875         memset(cfg, 0, sizeof(*cfg));
3876
3877         cfg->fc_table = rtm->rtm_table;
3878         cfg->fc_dst_len = rtm->rtm_dst_len;
3879         cfg->fc_src_len = rtm->rtm_src_len;
3880         cfg->fc_flags = RTF_UP;
3881         cfg->fc_protocol = rtm->rtm_protocol;
3882         cfg->fc_type = rtm->rtm_type;
3883
3884         if (rtm->rtm_type == RTN_UNREACHABLE ||
3885             rtm->rtm_type == RTN_BLACKHOLE ||
3886             rtm->rtm_type == RTN_PROHIBIT ||
3887             rtm->rtm_type == RTN_THROW)
3888                 cfg->fc_flags |= RTF_REJECT;
3889
3890         if (rtm->rtm_type == RTN_LOCAL)
3891                 cfg->fc_flags |= RTF_LOCAL;
3892
3893         if (rtm->rtm_flags & RTM_F_CLONED)
3894                 cfg->fc_flags |= RTF_CACHE;
3895
3896         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
3897
3898         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3899         cfg->fc_nlinfo.nlh = nlh;
3900         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3901
3902         if (tb[RTA_GATEWAY]) {
3903                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3904                 cfg->fc_flags |= RTF_GATEWAY;
3905         }
3906
3907         if (tb[RTA_DST]) {
3908                 int plen = (rtm->rtm_dst_len + 7) >> 3;
3909
3910                 if (nla_len(tb[RTA_DST]) < plen)
3911                         goto errout;
3912
3913                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3914         }
3915
3916         if (tb[RTA_SRC]) {
3917                 int plen = (rtm->rtm_src_len + 7) >> 3;
3918
3919                 if (nla_len(tb[RTA_SRC]) < plen)
3920                         goto errout;
3921
3922                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3923         }
3924
3925         if (tb[RTA_PREFSRC])
3926                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3927
3928         if (tb[RTA_OIF])
3929                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3930
3931         if (tb[RTA_PRIORITY])
3932                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3933
3934         if (tb[RTA_METRICS]) {
3935                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3936                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3937         }
3938
3939         if (tb[RTA_TABLE])
3940                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3941
3942         if (tb[RTA_MULTIPATH]) {
3943                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3944                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3945
3946                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3947                                                      cfg->fc_mp_len, extack);
3948                 if (err < 0)
3949                         goto errout;
3950         }
3951
3952         if (tb[RTA_PREF]) {
3953                 pref = nla_get_u8(tb[RTA_PREF]);
3954                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3955                     pref != ICMPV6_ROUTER_PREF_HIGH)
3956                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
3957                 cfg->fc_flags |= RTF_PREF(pref);
3958         }
3959
3960         if (tb[RTA_ENCAP])
3961                 cfg->fc_encap = tb[RTA_ENCAP];
3962
3963         if (tb[RTA_ENCAP_TYPE]) {
3964                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3965
3966                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3967                 if (err < 0)
3968                         goto errout;
3969         }
3970
3971         if (tb[RTA_EXPIRES]) {
3972                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3973
3974                 if (addrconf_finite_timeout(timeout)) {
3975                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3976                         cfg->fc_flags |= RTF_EXPIRES;
3977                 }
3978         }
3979
3980         err = 0;
3981 errout:
3982         return err;
3983 }
3984
3985 struct rt6_nh {
3986         struct rt6_info *rt6_info;
3987         struct fib6_config r_cfg;
3988         struct mx6_config mxc;
3989         struct list_head next;
3990 };
3991
3992 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3993 {
3994         struct rt6_nh *nh;
3995
3996         list_for_each_entry(nh, rt6_nh_list, next) {
3997                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3998                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3999                         nh->r_cfg.fc_ifindex);
4000         }
4001 }
4002
4003 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4004                                  struct rt6_info *rt, struct fib6_config *r_cfg)
4005 {
4006         struct rt6_nh *nh;
4007         int err = -EEXIST;
4008
4009         list_for_each_entry(nh, rt6_nh_list, next) {
4010                 /* check if rt6_info already exists */
4011                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4012                         return err;
4013         }
4014
4015         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4016         if (!nh)
4017                 return -ENOMEM;
4018         nh->rt6_info = rt;
4019         err = ip6_convert_metrics(&nh->mxc, r_cfg);
4020         if (err) {
4021                 kfree(nh);
4022                 return err;
4023         }
4024         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4025         list_add_tail(&nh->next, rt6_nh_list);
4026
4027         return 0;
4028 }
4029
4030 static void ip6_route_mpath_notify(struct rt6_info *rt,
4031                                    struct rt6_info *rt_last,
4032                                    struct nl_info *info,
4033                                    __u16 nlflags)
4034 {
4035         /* if this is an APPEND route, then rt points to the first route
4036          * inserted and rt_last points to last route inserted. Userspace
4037          * wants a consistent dump of the route which starts at the first
4038          * nexthop. Since sibling routes are always added at the end of
4039          * the list, find the first sibling of the last route appended
4040          */
4041         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4042                 rt = list_first_entry(&rt_last->rt6i_siblings,
4043                                       struct rt6_info,
4044                                       rt6i_siblings);
4045         }
4046
4047         if (rt)
4048                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4049 }
4050
4051 static int ip6_route_multipath_add(struct fib6_config *cfg,
4052                                    struct netlink_ext_ack *extack)
4053 {
4054         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4055         struct nl_info *info = &cfg->fc_nlinfo;
4056         struct fib6_config r_cfg;
4057         struct rtnexthop *rtnh;
4058         struct rt6_info *rt;
4059         struct rt6_nh *err_nh;
4060         struct rt6_nh *nh, *nh_safe;
4061         __u16 nlflags;
4062         int remaining;
4063         int attrlen;
4064         int err = 1;
4065         int nhn = 0;
4066         int replace = (cfg->fc_nlinfo.nlh &&
4067                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4068         LIST_HEAD(rt6_nh_list);
4069
4070         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4071         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4072                 nlflags |= NLM_F_APPEND;
4073
4074         remaining = cfg->fc_mp_len;
4075         rtnh = (struct rtnexthop *)cfg->fc_mp;
4076
4077         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4078          * rt6_info structs per nexthop
4079          */
4080         while (rtnh_ok(rtnh, remaining)) {
4081                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4082                 if (rtnh->rtnh_ifindex)
4083                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4084
4085                 attrlen = rtnh_attrlen(rtnh);
4086                 if (attrlen > 0) {
4087                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4088
4089                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4090                         if (nla) {
4091                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4092                                 r_cfg.fc_flags |= RTF_GATEWAY;
4093                         }
4094                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4095                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4096                         if (nla)
4097                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4098                 }
4099
4100                 rt = ip6_route_info_create(&r_cfg, extack);
4101                 if (IS_ERR(rt)) {
4102                         err = PTR_ERR(rt);
4103                         rt = NULL;
4104                         goto cleanup;
4105                 }
4106
4107                 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4108
4109                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4110                 if (err) {
4111                         dst_release_immediate(&rt->dst);
4112                         goto cleanup;
4113                 }
4114
4115                 rtnh = rtnh_next(rtnh, &remaining);
4116         }
4117
4118         /* for add and replace send one notification with all nexthops.
4119          * Skip the notification in fib6_add_rt2node and send one with
4120          * the full route when done
4121          */
4122         info->skip_notify = 1;
4123
4124         err_nh = NULL;
4125         list_for_each_entry(nh, &rt6_nh_list, next) {
4126                 rt_last = nh->rt6_info;
4127                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4128                 /* save reference to first route for notification */
4129                 if (!rt_notif && !err)
4130                         rt_notif = nh->rt6_info;
4131
4132                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4133                 nh->rt6_info = NULL;
4134                 if (err) {
4135                         if (replace && nhn)
4136                                 ip6_print_replace_route_err(&rt6_nh_list);
4137                         err_nh = nh;
4138                         goto add_errout;
4139                 }
4140
4141                 /* Because each route is added like a single route we remove
4142                  * these flags after the first nexthop: if there is a collision,
4143                  * we have already failed to add the first nexthop:
4144                  * fib6_add_rt2node() has rejected it; when replacing, old
4145                  * nexthops have been replaced by first new, the rest should
4146                  * be added to it.
4147                  */
4148                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4149                                                      NLM_F_REPLACE);
4150                 nhn++;
4151         }
4152
4153         /* success ... tell user about new route */
4154         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4155         goto cleanup;
4156
4157 add_errout:
4158         /* send notification for routes that were added so that
4159          * the delete notifications sent by ip6_route_del are
4160          * coherent
4161          */
4162         if (rt_notif)
4163                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4164
4165         /* Delete routes that were already added */
4166         list_for_each_entry(nh, &rt6_nh_list, next) {
4167                 if (err_nh == nh)
4168                         break;
4169                 ip6_route_del(&nh->r_cfg, extack);
4170         }
4171
4172 cleanup:
4173         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4174                 if (nh->rt6_info)
4175                         dst_release_immediate(&nh->rt6_info->dst);
4176                 kfree(nh->mxc.mx);
4177                 list_del(&nh->next);
4178                 kfree(nh);
4179         }
4180
4181         return err;
4182 }
4183
4184 static int ip6_route_multipath_del(struct fib6_config *cfg,
4185                                    struct netlink_ext_ack *extack)
4186 {
4187         struct fib6_config r_cfg;
4188         struct rtnexthop *rtnh;
4189         int remaining;
4190         int attrlen;
4191         int err = 1, last_err = 0;
4192
4193         remaining = cfg->fc_mp_len;
4194         rtnh = (struct rtnexthop *)cfg->fc_mp;
4195
4196         /* Parse a Multipath Entry */
4197         while (rtnh_ok(rtnh, remaining)) {
4198                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4199                 if (rtnh->rtnh_ifindex)
4200                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4201
4202                 attrlen = rtnh_attrlen(rtnh);
4203                 if (attrlen > 0) {
4204                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4205
4206                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4207                         if (nla) {
4208                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4209                                 r_cfg.fc_flags |= RTF_GATEWAY;
4210                         }
4211                 }
4212                 err = ip6_route_del(&r_cfg, extack);
4213                 if (err)
4214                         last_err = err;
4215
4216                 rtnh = rtnh_next(rtnh, &remaining);
4217         }
4218
4219         return last_err;
4220 }
4221
4222 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4223                               struct netlink_ext_ack *extack)
4224 {
4225         struct fib6_config cfg;
4226         int err;
4227
4228         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4229         if (err < 0)
4230                 return err;
4231
4232         if (cfg.fc_mp)
4233                 return ip6_route_multipath_del(&cfg, extack);
4234         else {
4235                 cfg.fc_delete_all_nh = 1;
4236                 return ip6_route_del(&cfg, extack);
4237         }
4238 }
4239
4240 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4241                               struct netlink_ext_ack *extack)
4242 {
4243         struct fib6_config cfg;
4244         int err;
4245
4246         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4247         if (err < 0)
4248                 return err;
4249
4250         if (cfg.fc_mp)
4251                 return ip6_route_multipath_add(&cfg, extack);
4252         else
4253                 return ip6_route_add(&cfg, extack);
4254 }
4255
4256 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4257 {
4258         int nexthop_len = 0;
4259
4260         if (rt->rt6i_nsiblings) {
4261                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4262                             + NLA_ALIGN(sizeof(struct rtnexthop))
4263                             + nla_total_size(16) /* RTA_GATEWAY */
4264                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4265
4266                 nexthop_len *= rt->rt6i_nsiblings;
4267         }
4268
4269         return NLMSG_ALIGN(sizeof(struct rtmsg))
4270                + nla_total_size(16) /* RTA_SRC */
4271                + nla_total_size(16) /* RTA_DST */
4272                + nla_total_size(16) /* RTA_GATEWAY */
4273                + nla_total_size(16) /* RTA_PREFSRC */
4274                + nla_total_size(4) /* RTA_TABLE */
4275                + nla_total_size(4) /* RTA_IIF */
4276                + nla_total_size(4) /* RTA_OIF */
4277                + nla_total_size(4) /* RTA_PRIORITY */
4278                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4279                + nla_total_size(sizeof(struct rta_cacheinfo))
4280                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4281                + nla_total_size(1) /* RTA_PREF */
4282                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4283                + nexthop_len;
4284 }
4285
4286 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4287                             unsigned int *flags, bool skip_oif)
4288 {
4289         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4290                 *flags |= RTNH_F_DEAD;
4291
4292         if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4293                 *flags |= RTNH_F_LINKDOWN;
4294                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4295                         *flags |= RTNH_F_DEAD;
4296         }
4297
4298         if (rt->rt6i_flags & RTF_GATEWAY) {
4299                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4300                         goto nla_put_failure;
4301         }
4302
4303         *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4304         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4305                 *flags |= RTNH_F_OFFLOAD;
4306
4307         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4308         if (!skip_oif && rt->dst.dev &&
4309             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4310                 goto nla_put_failure;
4311
4312         if (rt->dst.lwtstate &&
4313             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4314                 goto nla_put_failure;
4315
4316         return 0;
4317
4318 nla_put_failure:
4319         return -EMSGSIZE;
4320 }
4321
4322 /* add multipath next hop */
4323 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4324 {
4325         struct rtnexthop *rtnh;
4326         unsigned int flags = 0;
4327
4328         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4329         if (!rtnh)
4330                 goto nla_put_failure;
4331
4332         rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4333         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4334
4335         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4336                 goto nla_put_failure;
4337
4338         rtnh->rtnh_flags = flags;
4339
4340         /* length of rtnetlink header + attributes */
4341         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4342
4343         return 0;
4344
4345 nla_put_failure:
4346         return -EMSGSIZE;
4347 }
4348
4349 static int rt6_fill_node(struct net *net,
4350                          struct sk_buff *skb, struct rt6_info *rt,
4351                          struct in6_addr *dst, struct in6_addr *src,
4352                          int iif, int type, u32 portid, u32 seq,
4353                          unsigned int flags)
4354 {
4355         u32 metrics[RTAX_MAX];
4356         struct rtmsg *rtm;
4357         struct nlmsghdr *nlh;
4358         long expires;
4359         u32 table;
4360
4361         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4362         if (!nlh)
4363                 return -EMSGSIZE;
4364
4365         rtm = nlmsg_data(nlh);
4366         rtm->rtm_family = AF_INET6;
4367         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4368         rtm->rtm_src_len = rt->rt6i_src.plen;
4369         rtm->rtm_tos = 0;
4370         if (rt->rt6i_table)
4371                 table = rt->rt6i_table->tb6_id;
4372         else
4373                 table = RT6_TABLE_UNSPEC;
4374         rtm->rtm_table = table;
4375         if (nla_put_u32(skb, RTA_TABLE, table))
4376                 goto nla_put_failure;
4377         if (rt->rt6i_flags & RTF_REJECT) {
4378                 switch (rt->dst.error) {
4379                 case -EINVAL:
4380                         rtm->rtm_type = RTN_BLACKHOLE;
4381                         break;
4382                 case -EACCES:
4383                         rtm->rtm_type = RTN_PROHIBIT;
4384                         break;
4385                 case -EAGAIN:
4386                         rtm->rtm_type = RTN_THROW;
4387                         break;
4388                 default:
4389                         rtm->rtm_type = RTN_UNREACHABLE;
4390                         break;
4391                 }
4392         }
4393         else if (rt->rt6i_flags & RTF_LOCAL)
4394                 rtm->rtm_type = RTN_LOCAL;
4395         else if (rt->rt6i_flags & RTF_ANYCAST)
4396                 rtm->rtm_type = RTN_ANYCAST;
4397         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4398                 rtm->rtm_type = RTN_LOCAL;
4399         else
4400                 rtm->rtm_type = RTN_UNICAST;
4401         rtm->rtm_flags = 0;
4402         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4403         rtm->rtm_protocol = rt->rt6i_protocol;
4404
4405         if (rt->rt6i_flags & RTF_CACHE)
4406                 rtm->rtm_flags |= RTM_F_CLONED;
4407
4408         if (dst) {
4409                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4410                         goto nla_put_failure;
4411                 rtm->rtm_dst_len = 128;
4412         } else if (rtm->rtm_dst_len)
4413                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4414                         goto nla_put_failure;
4415 #ifdef CONFIG_IPV6_SUBTREES
4416         if (src) {
4417                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4418                         goto nla_put_failure;
4419                 rtm->rtm_src_len = 128;
4420         } else if (rtm->rtm_src_len &&
4421                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4422                 goto nla_put_failure;
4423 #endif
4424         if (iif) {
4425 #ifdef CONFIG_IPV6_MROUTE
4426                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4427                         int err = ip6mr_get_route(net, skb, rtm, portid);
4428
4429                         if (err == 0)
4430                                 return 0;
4431                         if (err < 0)
4432                                 goto nla_put_failure;
4433                 } else
4434 #endif
4435                         if (nla_put_u32(skb, RTA_IIF, iif))
4436                                 goto nla_put_failure;
4437         } else if (dst) {
4438                 struct in6_addr saddr_buf;
4439                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4440                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4441                         goto nla_put_failure;
4442         }
4443
4444         if (rt->rt6i_prefsrc.plen) {
4445                 struct in6_addr saddr_buf;
4446                 saddr_buf = rt->rt6i_prefsrc.addr;
4447                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4448                         goto nla_put_failure;
4449         }
4450
4451         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4452         if (rt->rt6i_pmtu)
4453                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4454         if (rtnetlink_put_metrics(skb, metrics) < 0)
4455                 goto nla_put_failure;
4456
4457         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4458                 goto nla_put_failure;
4459
4460         /* For multipath routes, walk the siblings list and add
4461          * each as a nexthop within RTA_MULTIPATH.
4462          */
4463         if (rt->rt6i_nsiblings) {
4464                 struct rt6_info *sibling, *next_sibling;
4465                 struct nlattr *mp;
4466
4467                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4468                 if (!mp)
4469                         goto nla_put_failure;
4470
4471                 if (rt6_add_nexthop(skb, rt) < 0)
4472                         goto nla_put_failure;
4473
4474                 list_for_each_entry_safe(sibling, next_sibling,
4475                                          &rt->rt6i_siblings, rt6i_siblings) {
4476                         if (rt6_add_nexthop(skb, sibling) < 0)
4477                                 goto nla_put_failure;
4478                 }
4479
4480                 nla_nest_end(skb, mp);
4481         } else {
4482                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4483                         goto nla_put_failure;
4484         }
4485
4486         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4487
4488         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4489                 goto nla_put_failure;
4490
4491         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4492                 goto nla_put_failure;
4493
4494
4495         nlmsg_end(skb, nlh);
4496         return 0;
4497
4498 nla_put_failure:
4499         nlmsg_cancel(skb, nlh);
4500         return -EMSGSIZE;
4501 }
4502
4503 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4504 {
4505         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4506         struct net *net = arg->net;
4507
4508         if (rt == net->ipv6.ip6_null_entry)
4509                 return 0;
4510
4511         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4512                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4513
4514                 /* user wants prefix routes only */
4515                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4516                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4517                         /* success since this is not a prefix route */
4518                         return 1;
4519                 }
4520         }
4521
4522         return rt6_fill_node(net,
4523                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4524                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4525                      NLM_F_MULTI);
4526 }
4527
4528 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4529                               struct netlink_ext_ack *extack)
4530 {
4531         struct net *net = sock_net(in_skb->sk);
4532         struct nlattr *tb[RTA_MAX+1];
4533         int err, iif = 0, oif = 0;
4534         struct dst_entry *dst;
4535         struct rt6_info *rt;
4536         struct sk_buff *skb;
4537         struct rtmsg *rtm;
4538         struct flowi6 fl6;
4539         bool fibmatch;
4540
4541         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4542                           extack);
4543         if (err < 0)
4544                 goto errout;
4545
4546         err = -EINVAL;
4547         memset(&fl6, 0, sizeof(fl6));
4548         rtm = nlmsg_data(nlh);
4549         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4550         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4551
4552         if (tb[RTA_SRC]) {
4553                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4554                         goto errout;
4555
4556                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4557         }
4558
4559         if (tb[RTA_DST]) {
4560                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4561                         goto errout;
4562
4563                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4564         }
4565
4566         if (tb[RTA_IIF])
4567                 iif = nla_get_u32(tb[RTA_IIF]);
4568
4569         if (tb[RTA_OIF])
4570                 oif = nla_get_u32(tb[RTA_OIF]);
4571
4572         if (tb[RTA_MARK])
4573                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4574
4575         if (tb[RTA_UID])
4576                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4577                                            nla_get_u32(tb[RTA_UID]));
4578         else
4579                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4580
4581         if (iif) {
4582                 struct net_device *dev;
4583                 int flags = 0;
4584
4585                 rcu_read_lock();
4586
4587                 dev = dev_get_by_index_rcu(net, iif);
4588                 if (!dev) {
4589                         rcu_read_unlock();
4590                         err = -ENODEV;
4591                         goto errout;
4592                 }
4593
4594                 fl6.flowi6_iif = iif;
4595
4596                 if (!ipv6_addr_any(&fl6.saddr))
4597                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4598
4599                 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4600
4601                 rcu_read_unlock();
4602         } else {
4603                 fl6.flowi6_oif = oif;
4604
4605                 dst = ip6_route_output(net, NULL, &fl6);
4606         }
4607
4608
4609         rt = container_of(dst, struct rt6_info, dst);
4610         if (rt->dst.error) {
4611                 err = rt->dst.error;
4612                 ip6_rt_put(rt);
4613                 goto errout;
4614         }
4615
4616         if (rt == net->ipv6.ip6_null_entry) {
4617                 err = rt->dst.error;
4618                 ip6_rt_put(rt);
4619                 goto errout;
4620         }
4621
4622         if (fibmatch && rt->from) {
4623                 struct rt6_info *ort = rt->from;
4624
4625                 dst_hold(&ort->dst);
4626                 ip6_rt_put(rt);
4627                 rt = ort;
4628         }
4629
4630         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4631         if (!skb) {
4632                 ip6_rt_put(rt);
4633                 err = -ENOBUFS;
4634                 goto errout;
4635         }
4636
4637         skb_dst_set(skb, &rt->dst);
4638         if (fibmatch)
4639                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4640                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4641                                     nlh->nlmsg_seq, 0);
4642         else
4643                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4644                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4645                                     nlh->nlmsg_seq, 0);
4646         if (err < 0) {
4647                 kfree_skb(skb);
4648                 goto errout;
4649         }
4650
4651         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4652 errout:
4653         return err;
4654 }
4655
4656 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4657                      unsigned int nlm_flags)
4658 {
4659         struct sk_buff *skb;
4660         struct net *net = info->nl_net;
4661         u32 seq;
4662         int err;
4663
4664         err = -ENOBUFS;
4665         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4666
4667         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4668         if (!skb)
4669                 goto errout;
4670
4671         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4672                                 event, info->portid, seq, nlm_flags);
4673         if (err < 0) {
4674                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4675                 WARN_ON(err == -EMSGSIZE);
4676                 kfree_skb(skb);
4677                 goto errout;
4678         }
4679         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4680                     info->nlh, gfp_any());
4681         return;
4682 errout:
4683         if (err < 0)
4684                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4685 }
4686
4687 static int ip6_route_dev_notify(struct notifier_block *this,
4688                                 unsigned long event, void *ptr)
4689 {
4690         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4691         struct net *net = dev_net(dev);
4692
4693         if (!(dev->flags & IFF_LOOPBACK))
4694                 return NOTIFY_OK;
4695
4696         if (event == NETDEV_REGISTER) {
4697                 net->ipv6.ip6_null_entry->dst.dev = dev;
4698                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4699 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4700                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4701                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4702                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4703                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4704 #endif
4705          } else if (event == NETDEV_UNREGISTER &&
4706                     dev->reg_state != NETREG_UNREGISTERED) {
4707                 /* NETDEV_UNREGISTER could be fired for multiple times by
4708                  * netdev_wait_allrefs(). Make sure we only call this once.
4709                  */
4710                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4711 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4712                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4713                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4714 #endif
4715         }
4716
4717         return NOTIFY_OK;
4718 }
4719
4720 /*
4721  *      /proc
4722  */
4723
4724 #ifdef CONFIG_PROC_FS
4725
4726 static const struct file_operations ipv6_route_proc_fops = {
4727         .open           = ipv6_route_open,
4728         .read           = seq_read,
4729         .llseek         = seq_lseek,
4730         .release        = seq_release_net,
4731 };
4732
4733 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4734 {
4735         struct net *net = (struct net *)seq->private;
4736         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4737                    net->ipv6.rt6_stats->fib_nodes,
4738                    net->ipv6.rt6_stats->fib_route_nodes,
4739                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4740                    net->ipv6.rt6_stats->fib_rt_entries,
4741                    net->ipv6.rt6_stats->fib_rt_cache,
4742                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4743                    net->ipv6.rt6_stats->fib_discarded_routes);
4744
4745         return 0;
4746 }
4747
4748 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4749 {
4750         return single_open_net(inode, file, rt6_stats_seq_show);
4751 }
4752
4753 static const struct file_operations rt6_stats_seq_fops = {
4754         .open    = rt6_stats_seq_open,
4755         .read    = seq_read,
4756         .llseek  = seq_lseek,
4757         .release = single_release_net,
4758 };
4759 #endif  /* CONFIG_PROC_FS */
4760
4761 #ifdef CONFIG_SYSCTL
4762
4763 static
4764 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4765                               void __user *buffer, size_t *lenp, loff_t *ppos)
4766 {
4767         struct net *net;
4768         int delay;
4769         if (!write)
4770                 return -EINVAL;
4771
4772         net = (struct net *)ctl->extra1;
4773         delay = net->ipv6.sysctl.flush_delay;
4774         proc_dointvec(ctl, write, buffer, lenp, ppos);
4775         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4776         return 0;
4777 }
4778
4779 struct ctl_table ipv6_route_table_template[] = {
4780         {
4781                 .procname       =       "flush",
4782                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4783                 .maxlen         =       sizeof(int),
4784                 .mode           =       0200,
4785                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4786         },
4787         {
4788                 .procname       =       "gc_thresh",
4789                 .data           =       &ip6_dst_ops_template.gc_thresh,
4790                 .maxlen         =       sizeof(int),
4791                 .mode           =       0644,
4792                 .proc_handler   =       proc_dointvec,
4793         },
4794         {
4795                 .procname       =       "max_size",
4796                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4797                 .maxlen         =       sizeof(int),
4798                 .mode           =       0644,
4799                 .proc_handler   =       proc_dointvec,
4800         },
4801         {
4802                 .procname       =       "gc_min_interval",
4803                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4804                 .maxlen         =       sizeof(int),
4805                 .mode           =       0644,
4806                 .proc_handler   =       proc_dointvec_jiffies,
4807         },
4808         {
4809                 .procname       =       "gc_timeout",
4810                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4811                 .maxlen         =       sizeof(int),
4812                 .mode           =       0644,
4813                 .proc_handler   =       proc_dointvec_jiffies,
4814         },
4815         {
4816                 .procname       =       "gc_interval",
4817                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4818                 .maxlen         =       sizeof(int),
4819                 .mode           =       0644,
4820                 .proc_handler   =       proc_dointvec_jiffies,
4821         },
4822         {
4823                 .procname       =       "gc_elasticity",
4824                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4825                 .maxlen         =       sizeof(int),
4826                 .mode           =       0644,
4827                 .proc_handler   =       proc_dointvec,
4828         },
4829         {
4830                 .procname       =       "mtu_expires",
4831                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4832                 .maxlen         =       sizeof(int),
4833                 .mode           =       0644,
4834                 .proc_handler   =       proc_dointvec_jiffies,
4835         },
4836         {
4837                 .procname       =       "min_adv_mss",
4838                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4839                 .maxlen         =       sizeof(int),
4840                 .mode           =       0644,
4841                 .proc_handler   =       proc_dointvec,
4842         },
4843         {
4844                 .procname       =       "gc_min_interval_ms",
4845                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4846                 .maxlen         =       sizeof(int),
4847                 .mode           =       0644,
4848                 .proc_handler   =       proc_dointvec_ms_jiffies,
4849         },
4850         { }
4851 };
4852
4853 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4854 {
4855         struct ctl_table *table;
4856
4857         table = kmemdup(ipv6_route_table_template,
4858                         sizeof(ipv6_route_table_template),
4859                         GFP_KERNEL);
4860
4861         if (table) {
4862                 table[0].data = &net->ipv6.sysctl.flush_delay;
4863                 table[0].extra1 = net;
4864                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4865                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4866                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4867                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4868                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4869                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4870                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4871                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4872                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4873
4874                 /* Don't export sysctls to unprivileged users */
4875                 if (net->user_ns != &init_user_ns)
4876                         table[0].procname = NULL;
4877         }
4878
4879         return table;
4880 }
4881 #endif
4882
4883 static int __net_init ip6_route_net_init(struct net *net)
4884 {
4885         int ret = -ENOMEM;
4886
4887         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4888                sizeof(net->ipv6.ip6_dst_ops));
4889
4890         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4891                 goto out_ip6_dst_ops;
4892
4893         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4894                                            sizeof(*net->ipv6.ip6_null_entry),
4895                                            GFP_KERNEL);
4896         if (!net->ipv6.ip6_null_entry)
4897                 goto out_ip6_dst_entries;
4898         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4899         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4900                          ip6_template_metrics, true);
4901
4902 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4903         net->ipv6.fib6_has_custom_rules = false;
4904         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4905                                                sizeof(*net->ipv6.ip6_prohibit_entry),
4906                                                GFP_KERNEL);
4907         if (!net->ipv6.ip6_prohibit_entry)
4908                 goto out_ip6_null_entry;
4909         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4910         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4911                          ip6_template_metrics, true);
4912
4913         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4914                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
4915                                                GFP_KERNEL);
4916         if (!net->ipv6.ip6_blk_hole_entry)
4917                 goto out_ip6_prohibit_entry;
4918         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4919         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4920                          ip6_template_metrics, true);
4921 #endif
4922
4923         net->ipv6.sysctl.flush_delay = 0;
4924         net->ipv6.sysctl.ip6_rt_max_size = 4096;
4925         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4926         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4927         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4928         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4929         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4930         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4931
4932         net->ipv6.ip6_rt_gc_expire = 30*HZ;
4933
4934         ret = 0;
4935 out:
4936         return ret;
4937
4938 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4939 out_ip6_prohibit_entry:
4940         kfree(net->ipv6.ip6_prohibit_entry);
4941 out_ip6_null_entry:
4942         kfree(net->ipv6.ip6_null_entry);
4943 #endif
4944 out_ip6_dst_entries:
4945         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4946 out_ip6_dst_ops:
4947         goto out;
4948 }
4949
4950 static void __net_exit ip6_route_net_exit(struct net *net)
4951 {
4952         kfree(net->ipv6.ip6_null_entry);
4953 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4954         kfree(net->ipv6.ip6_prohibit_entry);
4955         kfree(net->ipv6.ip6_blk_hole_entry);
4956 #endif
4957         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4958 }
4959
4960 static int __net_init ip6_route_net_init_late(struct net *net)
4961 {
4962 #ifdef CONFIG_PROC_FS
4963         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4964         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4965 #endif
4966         return 0;
4967 }
4968
4969 static void __net_exit ip6_route_net_exit_late(struct net *net)
4970 {
4971 #ifdef CONFIG_PROC_FS
4972         remove_proc_entry("ipv6_route", net->proc_net);
4973         remove_proc_entry("rt6_stats", net->proc_net);
4974 #endif
4975 }
4976
4977 static struct pernet_operations ip6_route_net_ops = {
4978         .init = ip6_route_net_init,
4979         .exit = ip6_route_net_exit,
4980 };
4981
4982 static int __net_init ipv6_inetpeer_init(struct net *net)
4983 {
4984         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4985
4986         if (!bp)
4987                 return -ENOMEM;
4988         inet_peer_base_init(bp);
4989         net->ipv6.peers = bp;
4990         return 0;
4991 }
4992
4993 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4994 {
4995         struct inet_peer_base *bp = net->ipv6.peers;
4996
4997         net->ipv6.peers = NULL;
4998         inetpeer_invalidate_tree(bp);
4999         kfree(bp);
5000 }
5001
5002 static struct pernet_operations ipv6_inetpeer_ops = {
5003         .init   =       ipv6_inetpeer_init,
5004         .exit   =       ipv6_inetpeer_exit,
5005 };
5006
5007 static struct pernet_operations ip6_route_net_late_ops = {
5008         .init = ip6_route_net_init_late,
5009         .exit = ip6_route_net_exit_late,
5010 };
5011
5012 static struct notifier_block ip6_route_dev_notifier = {
5013         .notifier_call = ip6_route_dev_notify,
5014         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5015 };
5016
5017 void __init ip6_route_init_special_entries(void)
5018 {
5019         /* Registering of the loopback is done before this portion of code,
5020          * the loopback reference in rt6_info will not be taken, do it
5021          * manually for init_net */
5022         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5023         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5024   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5025         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5026         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5027         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5028         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5029   #endif
5030 }
5031
5032 int __init ip6_route_init(void)
5033 {
5034         int ret;
5035         int cpu;
5036
5037         ret = -ENOMEM;
5038         ip6_dst_ops_template.kmem_cachep =
5039                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5040                                   SLAB_HWCACHE_ALIGN, NULL);
5041         if (!ip6_dst_ops_template.kmem_cachep)
5042                 goto out;
5043
5044         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5045         if (ret)
5046                 goto out_kmem_cache;
5047
5048         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5049         if (ret)
5050                 goto out_dst_entries;
5051
5052         ret = register_pernet_subsys(&ip6_route_net_ops);
5053         if (ret)
5054                 goto out_register_inetpeer;
5055
5056         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5057
5058         ret = fib6_init();
5059         if (ret)
5060                 goto out_register_subsys;
5061
5062         ret = xfrm6_init();
5063         if (ret)
5064                 goto out_fib6_init;
5065
5066         ret = fib6_rules_init();
5067         if (ret)
5068                 goto xfrm6_init;
5069
5070         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5071         if (ret)
5072                 goto fib6_rules_init;
5073
5074         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5075                                    inet6_rtm_newroute, NULL, 0);
5076         if (ret < 0)
5077                 goto out_register_late_subsys;
5078
5079         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5080                                    inet6_rtm_delroute, NULL, 0);
5081         if (ret < 0)
5082                 goto out_register_late_subsys;
5083
5084         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5085                                    inet6_rtm_getroute, NULL,
5086                                    RTNL_FLAG_DOIT_UNLOCKED);
5087         if (ret < 0)
5088                 goto out_register_late_subsys;
5089
5090         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5091         if (ret)
5092                 goto out_register_late_subsys;
5093
5094         for_each_possible_cpu(cpu) {
5095                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5096
5097                 INIT_LIST_HEAD(&ul->head);
5098                 spin_lock_init(&ul->lock);
5099         }
5100
5101 out:
5102         return ret;
5103
5104 out_register_late_subsys:
5105         rtnl_unregister_all(PF_INET6);
5106         unregister_pernet_subsys(&ip6_route_net_late_ops);
5107 fib6_rules_init:
5108         fib6_rules_cleanup();
5109 xfrm6_init:
5110         xfrm6_fini();
5111 out_fib6_init:
5112         fib6_gc_cleanup();
5113 out_register_subsys:
5114         unregister_pernet_subsys(&ip6_route_net_ops);
5115 out_register_inetpeer:
5116         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5117 out_dst_entries:
5118         dst_entries_destroy(&ip6_dst_blackhole_ops);
5119 out_kmem_cache:
5120         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5121         goto out;
5122 }
5123
5124 void ip6_route_cleanup(void)
5125 {
5126         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5127         unregister_pernet_subsys(&ip6_route_net_late_ops);
5128         fib6_rules_cleanup();
5129         xfrm6_fini();
5130         fib6_gc_cleanup();
5131         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5132         unregister_pernet_subsys(&ip6_route_net_ops);
5133         dst_entries_destroy(&ip6_dst_blackhole_ops);
5134         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5135 }
This page took 0.344536 seconds and 4 git commands to generate.