]> Git Repo - linux.git/blob - net/ipv6/route.c
soc: qcom: Add GENI based QUP Wrapper driver
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/*
 * Result of checking a route's next hop.  Every failure value is negative
 * so that callers can test "< 0"; the only success value is positive.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,	/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE	= -2,
	RT6_NUD_FAIL_DO_RR	= -1,	/* find_match() scores it 0 and asks for round-robin */
	RT6_NUD_SUCCEED		= 1
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif
123
124 struct uncached_list {
125         spinlock_t              lock;
126         struct list_head        head;
127 };
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157         struct net_device *loopback_dev = net->loopback_dev;
158         int cpu;
159
160         if (dev == loopback_dev)
161                 return;
162
163         for_each_possible_cpu(cpu) {
164                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165                 struct rt6_info *rt;
166
167                 spin_lock_bh(&ul->lock);
168                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169                         struct inet6_dev *rt_idev = rt->rt6i_idev;
170                         struct net_device *rt_dev = rt->dst.dev;
171
172                         if (rt_idev->dev == dev) {
173                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
174                                 in6_dev_put(rt_idev);
175                         }
176
177                         if (rt_dev == dev) {
178                                 rt->dst.dev = loopback_dev;
179                                 dev_hold(rt->dst.dev);
180                                 dev_put(rt_dev);
181                         }
182                 }
183                 spin_unlock_bh(&ul->lock);
184         }
185 }
186
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189         return dst_metrics_write_ptr(&rt->from->dst);
190 }
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
246 static struct dst_ops ip6_dst_ops_template = {
247         .family                 =       AF_INET6,
248         .gc                     =       ip6_dst_gc,
249         .gc_thresh              =       1024,
250         .check                  =       ip6_dst_check,
251         .default_advmss         =       ip6_default_advmss,
252         .mtu                    =       ip6_mtu,
253         .cow_metrics            =       ipv6_cow_metrics,
254         .destroy                =       ip6_dst_destroy,
255         .ifdown                 =       ip6_dst_ifdown,
256         .negative_advice        =       ip6_negative_advice,
257         .link_failure           =       ip6_link_failure,
258         .update_pmtu            =       ip6_rt_update_pmtu,
259         .redirect               =       rt6_do_redirect,
260         .local_out              =       __ip6_local_out,
261         .neigh_lookup           =       ip6_neigh_lookup,
262         .confirm_neigh          =       ip6_confirm_neigh,
263 };
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273                                          struct sk_buff *skb, u32 mtu)
274 {
275 }
276
/* redirect callback of ip6_dst_blackhole_ops: does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* no-op */
}
281
282 static struct dst_ops ip6_dst_blackhole_ops = {
283         .family                 =       AF_INET6,
284         .destroy                =       ip6_dst_destroy,
285         .check                  =       ip6_dst_check,
286         .mtu                    =       ip6_blackhole_mtu,
287         .default_advmss         =       ip6_default_advmss,
288         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
289         .redirect               =       ip6_rt_blackhole_redirect,
290         .cow_metrics            =       dst_cow_metrics_generic,
291         .neigh_lookup           =       ip6_neigh_lookup,
292 };
293
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295         [RTAX_HOPLIMIT - 1] = 0,
296 };
297
298 static const struct rt6_info ip6_null_entry_template = {
299         .dst = {
300                 .__refcnt       = ATOMIC_INIT(1),
301                 .__use          = 1,
302                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
303                 .error          = -ENETUNREACH,
304                 .input          = ip6_pkt_discard,
305                 .output         = ip6_pkt_discard_out,
306         },
307         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
308         .rt6i_protocol  = RTPROT_KERNEL,
309         .rt6i_metric    = ~(u32) 0,
310         .rt6i_ref       = ATOMIC_INIT(1),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325         .rt6i_protocol  = RTPROT_KERNEL,
326         .rt6i_metric    = ~(u32) 0,
327         .rt6i_ref       = ATOMIC_INIT(1),
328 };
329
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331         .dst = {
332                 .__refcnt       = ATOMIC_INIT(1),
333                 .__use          = 1,
334                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
335                 .error          = -EINVAL,
336                 .input          = dst_discard,
337                 .output         = dst_discard_out,
338         },
339         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
340         .rt6i_protocol  = RTPROT_KERNEL,
341         .rt6i_metric    = ~(u32) 0,
342         .rt6i_ref       = ATOMIC_INIT(1),
343 };
344
345 #endif
346
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349         struct dst_entry *dst = &rt->dst;
350
351         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352         INIT_LIST_HEAD(&rt->rt6i_siblings);
353         INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392         struct rt6_info *rt = (struct rt6_info *)dst;
393         struct rt6_exception_bucket *bucket;
394         struct rt6_info *from = rt->from;
395         struct inet6_dev *idev;
396
397         dst_destroy_metrics_generic(dst);
398         free_percpu(rt->rt6i_pcpu);
399         rt6_uncached_list_del(rt);
400
401         idev = rt->rt6i_idev;
402         if (idev) {
403                 rt->rt6i_idev = NULL;
404                 in6_dev_put(idev);
405         }
406         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407         if (bucket) {
408                 rt->rt6i_exception_bucket = NULL;
409                 kfree(bucket);
410         }
411
412         rt->from = NULL;
413         dst_release(&from->dst);
414 }
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
453 static struct rt6_info *rt6_multipath_select(const struct net *net,
454                                              struct rt6_info *match,
455                                              struct flowi6 *fl6, int oif,
456                                              const struct sk_buff *skb,
457                                              int strict)
458 {
459         struct rt6_info *sibling, *next_sibling;
460
461         /* We might have already computed the hash for ICMPv6 errors. In such
462          * case it will always be non-zero. Otherwise now is the time to do it.
463          */
464         if (!fl6->mp_hash)
465                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
466
467         if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
468                 return match;
469
470         list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
471                                  rt6i_siblings) {
472                 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
473                         continue;
474                 if (rt6_score_route(sibling, oif, strict) < 0)
475                         break;
476                 match = sibling;
477                 break;
478         }
479
480         return match;
481 }
482
483 /*
484  *      Route lookup. rcu_read_lock() should be held.
485  */
486
487 static inline struct rt6_info *rt6_device_match(struct net *net,
488                                                     struct rt6_info *rt,
489                                                     const struct in6_addr *saddr,
490                                                     int oif,
491                                                     int flags)
492 {
493         struct rt6_info *local = NULL;
494         struct rt6_info *sprt;
495
496         if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
497                 return rt;
498
499         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
500                 struct net_device *dev = sprt->dst.dev;
501
502                 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
503                         continue;
504
505                 if (oif) {
506                         if (dev->ifindex == oif)
507                                 return sprt;
508                         if (dev->flags & IFF_LOOPBACK) {
509                                 if (!sprt->rt6i_idev ||
510                                     sprt->rt6i_idev->dev->ifindex != oif) {
511                                         if (flags & RT6_LOOKUP_F_IFACE)
512                                                 continue;
513                                         if (local &&
514                                             local->rt6i_idev->dev->ifindex == oif)
515                                                 continue;
516                                 }
517                                 local = sprt;
518                         }
519                 } else {
520                         if (ipv6_chk_addr(net, saddr, dev,
521                                           flags & RT6_LOOKUP_F_IFACE))
522                                 return sprt;
523                 }
524         }
525
526         if (oif) {
527                 if (local)
528                         return local;
529
530                 if (flags & RT6_LOOKUP_F_IFACE)
531                         return net->ipv6.ip6_null_entry;
532         }
533
534         return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
535 }
536
537 #ifdef CONFIG_IPV6_ROUTER_PREF
538 struct __rt6_probe_work {
539         struct work_struct work;
540         struct in6_addr target;
541         struct net_device *dev;
542 };
543
544 static void rt6_probe_deferred(struct work_struct *w)
545 {
546         struct in6_addr mcaddr;
547         struct __rt6_probe_work *work =
548                 container_of(w, struct __rt6_probe_work, work);
549
550         addrconf_addr_solict_mult(&work->target, &mcaddr);
551         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
552         dev_put(work->dev);
553         kfree(work);
554 }
555
556 static void rt6_probe(struct rt6_info *rt)
557 {
558         struct __rt6_probe_work *work;
559         struct neighbour *neigh;
560         /*
561          * Okay, this does not seem to be appropriate
562          * for now, however, we need to check if it
563          * is really so; aka Router Reachability Probing.
564          *
565          * Router Reachability Probe MUST be rate-limited
566          * to no more than one per minute.
567          */
568         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
569                 return;
570         rcu_read_lock_bh();
571         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
572         if (neigh) {
573                 if (neigh->nud_state & NUD_VALID)
574                         goto out;
575
576                 work = NULL;
577                 write_lock(&neigh->lock);
578                 if (!(neigh->nud_state & NUD_VALID) &&
579                     time_after(jiffies,
580                                neigh->updated +
581                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
582                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
583                         if (work)
584                                 __neigh_set_probe_once(neigh);
585                 }
586                 write_unlock(&neigh->lock);
587         } else {
588                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
589         }
590
591         if (work) {
592                 INIT_WORK(&work->work, rt6_probe_deferred);
593                 work->target = rt->rt6i_gateway;
594                 dev_hold(rt->dst.dev);
595                 work->dev = rt->dst.dev;
596                 schedule_work(&work->work);
597         }
598
599 out:
600         rcu_read_unlock_bh();
601 }
602 #else
/* Without CONFIG_IPV6_ROUTER_PREF there is no router probing. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* no-op */
}
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
612 {
613         struct net_device *dev = rt->dst.dev;
614         if (!oif || dev->ifindex == oif)
615                 return 2;
616         if ((dev->flags & IFF_LOOPBACK) &&
617             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
618                 return 1;
619         return 0;
620 }
621
622 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
623 {
624         struct neighbour *neigh;
625         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
626
627         if (rt->rt6i_flags & RTF_NONEXTHOP ||
628             !(rt->rt6i_flags & RTF_GATEWAY))
629                 return RT6_NUD_SUCCEED;
630
631         rcu_read_lock_bh();
632         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
633         if (neigh) {
634                 read_lock(&neigh->lock);
635                 if (neigh->nud_state & NUD_VALID)
636                         ret = RT6_NUD_SUCCEED;
637 #ifdef CONFIG_IPV6_ROUTER_PREF
638                 else if (!(neigh->nud_state & NUD_FAILED))
639                         ret = RT6_NUD_SUCCEED;
640                 else
641                         ret = RT6_NUD_FAIL_PROBE;
642 #endif
643                 read_unlock(&neigh->lock);
644         } else {
645                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
646                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
647         }
648         rcu_read_unlock_bh();
649
650         return ret;
651 }
652
653 static int rt6_score_route(struct rt6_info *rt, int oif,
654                            int strict)
655 {
656         int m;
657
658         m = rt6_check_dev(rt, oif);
659         if (!m && (strict & RT6_LOOKUP_F_IFACE))
660                 return RT6_NUD_FAIL_HARD;
661 #ifdef CONFIG_IPV6_ROUTER_PREF
662         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
663 #endif
664         if (strict & RT6_LOOKUP_F_REACHABLE) {
665                 int n = rt6_check_neigh(rt);
666                 if (n < 0)
667                         return n;
668         }
669         return m;
670 }
671
672 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
673                                    int *mpri, struct rt6_info *match,
674                                    bool *do_rr)
675 {
676         int m;
677         bool match_do_rr = false;
678         struct inet6_dev *idev = rt->rt6i_idev;
679
680         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
681                 goto out;
682
683         if (idev->cnf.ignore_routes_with_linkdown &&
684             rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
685             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
686                 goto out;
687
688         if (rt6_check_expired(rt))
689                 goto out;
690
691         m = rt6_score_route(rt, oif, strict);
692         if (m == RT6_NUD_FAIL_DO_RR) {
693                 match_do_rr = true;
694                 m = 0; /* lowest valid score */
695         } else if (m == RT6_NUD_FAIL_HARD) {
696                 goto out;
697         }
698
699         if (strict & RT6_LOOKUP_F_REACHABLE)
700                 rt6_probe(rt);
701
702         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
703         if (m > *mpri) {
704                 *do_rr = match_do_rr;
705                 *mpri = m;
706                 match = rt;
707         }
708 out:
709         return match;
710 }
711
712 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
713                                      struct rt6_info *leaf,
714                                      struct rt6_info *rr_head,
715                                      u32 metric, int oif, int strict,
716                                      bool *do_rr)
717 {
718         struct rt6_info *rt, *match, *cont;
719         int mpri = -1;
720
721         match = NULL;
722         cont = NULL;
723         for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
724                 if (rt->rt6i_metric != metric) {
725                         cont = rt;
726                         break;
727                 }
728
729                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730         }
731
732         for (rt = leaf; rt && rt != rr_head;
733              rt = rcu_dereference(rt->rt6_next)) {
734                 if (rt->rt6i_metric != metric) {
735                         cont = rt;
736                         break;
737                 }
738
739                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
740         }
741
742         if (match || !cont)
743                 return match;
744
745         for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
746                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
747
748         return match;
749 }
750
751 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
752                                    int oif, int strict)
753 {
754         struct rt6_info *leaf = rcu_dereference(fn->leaf);
755         struct rt6_info *match, *rt0;
756         bool do_rr = false;
757         int key_plen;
758
759         if (!leaf || leaf == net->ipv6.ip6_null_entry)
760                 return net->ipv6.ip6_null_entry;
761
762         rt0 = rcu_dereference(fn->rr_ptr);
763         if (!rt0)
764                 rt0 = leaf;
765
766         /* Double check to make sure fn is not an intermediate node
767          * and fn->leaf does not points to its child's leaf
768          * (This might happen if all routes under fn are deleted from
769          * the tree and fib6_repair_tree() is called on the node.)
770          */
771         key_plen = rt0->rt6i_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773         if (rt0->rt6i_src.plen)
774                 key_plen = rt0->rt6i_src.plen;
775 #endif
776         if (fn->fn_bit != key_plen)
777                 return net->ipv6.ip6_null_entry;
778
779         match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
780                              &do_rr);
781
782         if (do_rr) {
783                 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
784
785                 /* no entries matched; do round-robin */
786                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
787                         next = leaf;
788
789                 if (next != rt0) {
790                         spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791                         /* make sure next is not being deleted from the tree */
792                         if (next->rt6i_node)
793                                 rcu_assign_pointer(fn->rr_ptr, next);
794                         spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
795                 }
796         }
797
798         return match ? match : net->ipv6.ip6_null_entry;
799 }
800
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802 {
803         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804 }
805
806 #ifdef CONFIG_IPV6_ROUTE_INFO
807 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
808                   const struct in6_addr *gwaddr)
809 {
810         struct net *net = dev_net(dev);
811         struct route_info *rinfo = (struct route_info *) opt;
812         struct in6_addr prefix_buf, *prefix;
813         unsigned int pref;
814         unsigned long lifetime;
815         struct rt6_info *rt;
816
817         if (len < sizeof(struct route_info)) {
818                 return -EINVAL;
819         }
820
821         /* Sanity check for prefix_len and length */
822         if (rinfo->length > 3) {
823                 return -EINVAL;
824         } else if (rinfo->prefix_len > 128) {
825                 return -EINVAL;
826         } else if (rinfo->prefix_len > 64) {
827                 if (rinfo->length < 2) {
828                         return -EINVAL;
829                 }
830         } else if (rinfo->prefix_len > 0) {
831                 if (rinfo->length < 1) {
832                         return -EINVAL;
833                 }
834         }
835
836         pref = rinfo->route_pref;
837         if (pref == ICMPV6_ROUTER_PREF_INVALID)
838                 return -EINVAL;
839
840         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
841
842         if (rinfo->length == 3)
843                 prefix = (struct in6_addr *)rinfo->prefix;
844         else {
845                 /* this function is safe */
846                 ipv6_addr_prefix(&prefix_buf,
847                                  (struct in6_addr *)rinfo->prefix,
848                                  rinfo->prefix_len);
849                 prefix = &prefix_buf;
850         }
851
852         if (rinfo->prefix_len == 0)
853                 rt = rt6_get_dflt_router(gwaddr, dev);
854         else
855                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
856                                         gwaddr, dev);
857
858         if (rt && !lifetime) {
859                 ip6_del_rt(rt);
860                 rt = NULL;
861         }
862
863         if (!rt && lifetime)
864                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
865                                         dev, pref);
866         else if (rt)
867                 rt->rt6i_flags = RTF_ROUTEINFO |
868                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
869
870         if (rt) {
871                 if (!addrconf_finite_timeout(lifetime))
872                         rt6_clean_expires(rt);
873                 else
874                         rt6_set_expires(rt, jiffies + HZ * lifetime);
875
876                 ip6_rt_put(rt);
877         }
878         return 0;
879 }
880 #endif
881
882 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883                                         struct in6_addr *saddr)
884 {
885         struct fib6_node *pn, *sn;
886         while (1) {
887                 if (fn->fn_flags & RTN_TL_ROOT)
888                         return NULL;
889                 pn = rcu_dereference(fn->parent);
890                 sn = FIB6_SUBTREE(pn);
891                 if (sn && sn != fn)
892                         fn = fib6_lookup(sn, NULL, saddr);
893                 else
894                         fn = pn;
895                 if (fn->fn_flags & RTN_RTINFO)
896                         return fn;
897         }
898 }
899
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901                           bool null_fallback)
902 {
903         struct rt6_info *rt = *prt;
904
905         if (dst_hold_safe(&rt->dst))
906                 return true;
907         if (null_fallback) {
908                 rt = net->ipv6.ip6_null_entry;
909                 dst_hold(&rt->dst);
910         } else {
911                 rt = NULL;
912         }
913         *prt = rt;
914         return false;
915 }
916
917 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918                                              struct fib6_table *table,
919                                              struct flowi6 *fl6,
920                                              const struct sk_buff *skb,
921                                              int flags)
922 {
923         struct rt6_info *rt, *rt_cache;
924         struct fib6_node *fn;
925
926         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
927                 flags &= ~RT6_LOOKUP_F_IFACE;
928
929         rcu_read_lock();
930         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
931 restart:
932         rt = rcu_dereference(fn->leaf);
933         if (!rt) {
934                 rt = net->ipv6.ip6_null_entry;
935         } else {
936                 rt = rt6_device_match(net, rt, &fl6->saddr,
937                                       fl6->flowi6_oif, flags);
938                 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
939                         rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
940                                                   skb, flags);
941         }
942         if (rt == net->ipv6.ip6_null_entry) {
943                 fn = fib6_backtrack(fn, &fl6->saddr);
944                 if (fn)
945                         goto restart;
946         }
947         /* Search through exception table */
948         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
949         if (rt_cache)
950                 rt = rt_cache;
951
952         if (ip6_hold_safe(net, &rt, true))
953                 dst_use_noref(&rt->dst, jiffies);
954
955         rcu_read_unlock();
956
957         trace_fib6_table_lookup(net, rt, table, fl6);
958
959         return rt;
960
961 }
962
963 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
964                                    const struct sk_buff *skb, int flags)
965 {
966         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
967 }
968 EXPORT_SYMBOL_GPL(ip6_route_lookup);
969
970 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
971                             const struct in6_addr *saddr, int oif,
972                             const struct sk_buff *skb, int strict)
973 {
974         struct flowi6 fl6 = {
975                 .flowi6_oif = oif,
976                 .daddr = *daddr,
977         };
978         struct dst_entry *dst;
979         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
980
981         if (saddr) {
982                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983                 flags |= RT6_LOOKUP_F_HAS_SADDR;
984         }
985
986         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
987         if (dst->error == 0)
988                 return (struct rt6_info *) dst;
989
990         dst_release(dst);
991
992         return NULL;
993 }
994 EXPORT_SYMBOL(rt6_lookup);
995
996 /* ip6_ins_rt is called with FREE table->tb6_lock.
997  * It takes new route entry, the addition fails by any reason the
998  * route is released.
999  * Caller must hold dst before calling it.
1000  */
1001
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003                         struct mx6_config *mxc,
1004                         struct netlink_ext_ack *extack)
1005 {
1006         int err;
1007         struct fib6_table *table;
1008
1009         table = rt->rt6i_table;
1010         spin_lock_bh(&table->tb6_lock);
1011         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012         spin_unlock_bh(&table->tb6_lock);
1013
1014         return err;
1015 }
1016
1017 int ip6_ins_rt(struct rt6_info *rt)
1018 {
1019         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020         struct mx6_config mxc = { .mx = NULL, };
1021
1022         /* Hold dst to account for the reference from the fib6 tree */
1023         dst_hold(&rt->dst);
1024         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1025 }
1026
1027 /* called with rcu_lock held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029 {
1030         struct net_device *dev = rt->dst.dev;
1031
1032         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1033                 /* for copies of local routes, dst->dev needs to be the
1034                  * device if it is a master device, the master device if
1035                  * device is enslaved, and the loopback as the default
1036                  */
1037                 if (netif_is_l3_slave(dev) &&
1038                     !rt6_need_strict(&rt->rt6i_dst.addr))
1039                         dev = l3mdev_master_dev_rcu(dev);
1040                 else if (!netif_is_l3_master(dev))
1041                         dev = dev_net(dev)->loopback_dev;
1042                 /* last case is netif_is_l3_master(dev) is true in which
1043                  * case we want dev returned to be dev
1044                  */
1045         }
1046
1047         return dev;
1048 }
1049
1050 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051                                            const struct in6_addr *daddr,
1052                                            const struct in6_addr *saddr)
1053 {
1054         struct net_device *dev;
1055         struct rt6_info *rt;
1056
1057         /*
1058          *      Clone the route.
1059          */
1060
1061         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062                 ort = ort->from;
1063
1064         rcu_read_lock();
1065         dev = ip6_rt_get_dev_rcu(ort);
1066         rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1067         rcu_read_unlock();
1068         if (!rt)
1069                 return NULL;
1070
1071         ip6_rt_copy_init(rt, ort);
1072         rt->rt6i_flags |= RTF_CACHE;
1073         rt->rt6i_metric = 0;
1074         rt->dst.flags |= DST_HOST;
1075         rt->rt6i_dst.addr = *daddr;
1076         rt->rt6i_dst.plen = 128;
1077
1078         if (!rt6_is_gw_or_nonexthop(ort)) {
1079                 if (ort->rt6i_dst.plen != 128 &&
1080                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081                         rt->rt6i_flags |= RTF_ANYCAST;
1082 #ifdef CONFIG_IPV6_SUBTREES
1083                 if (rt->rt6i_src.plen && saddr) {
1084                         rt->rt6i_src.addr = *saddr;
1085                         rt->rt6i_src.plen = 128;
1086                 }
1087 #endif
1088         }
1089
1090         return rt;
1091 }
1092
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1094 {
1095         struct net_device *dev;
1096         struct rt6_info *pcpu_rt;
1097
1098         rcu_read_lock();
1099         dev = ip6_rt_get_dev_rcu(rt);
1100         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1101         rcu_read_unlock();
1102         if (!pcpu_rt)
1103                 return NULL;
1104         ip6_rt_copy_init(pcpu_rt, rt);
1105         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106         pcpu_rt->rt6i_flags |= RTF_PCPU;
1107         return pcpu_rt;
1108 }
1109
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1112 {
1113         struct rt6_info *pcpu_rt, **p;
1114
1115         p = this_cpu_ptr(rt->rt6i_pcpu);
1116         pcpu_rt = *p;
1117
1118         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119                 rt6_dst_from_metrics_check(pcpu_rt);
1120
1121         return pcpu_rt;
1122 }
1123
1124 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1125 {
1126         struct rt6_info *pcpu_rt, *prev, **p;
1127
1128         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1129         if (!pcpu_rt) {
1130                 struct net *net = dev_net(rt->dst.dev);
1131
1132                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133                 return net->ipv6.ip6_null_entry;
1134         }
1135
1136         dst_hold(&pcpu_rt->dst);
1137         p = this_cpu_ptr(rt->rt6i_pcpu);
1138         prev = cmpxchg(p, NULL, pcpu_rt);
1139         BUG_ON(prev);
1140
1141         rt6_dst_from_metrics_check(pcpu_rt);
1142         return pcpu_rt;
1143 }
1144
1145 /* exception hash table implementation
 *
 * rt6_exception_lock is one file-scope spinlock: the exception-table
 * helpers in this file either take it themselves or document that the
 * caller must hold it before a route's exception buckets are modified.
1146  */
1147 static DEFINE_SPINLOCK(rt6_exception_lock);
1148
1149 /* Remove rt6_ex from hash table and free the memory
1150  * Caller must hold rt6_exception_lock
1151  */
1152 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153                                  struct rt6_exception *rt6_ex)
1154 {
1155         struct net *net;
1156
1157         if (!bucket || !rt6_ex)
1158                 return;
1159
1160         net = dev_net(rt6_ex->rt6i->dst.dev);
1161         rt6_ex->rt6i->rt6i_node = NULL;
1162         hlist_del_rcu(&rt6_ex->hlist);
1163         rt6_release(rt6_ex->rt6i);
1164         kfree_rcu(rt6_ex, rcu);
1165         WARN_ON_ONCE(!bucket->depth);
1166         bucket->depth--;
1167         net->ipv6.rt6_stats->fib_rt_cache--;
1168 }
1169
1170 /* Remove oldest rt6_ex in bucket and free the memory
1171  * Caller must hold rt6_exception_lock
1172  */
1173 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1174 {
1175         struct rt6_exception *rt6_ex, *oldest = NULL;
1176
1177         if (!bucket)
1178                 return;
1179
1180         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1181                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1182                         oldest = rt6_ex;
1183         }
1184         rt6_remove_exception(bucket, oldest);
1185 }
1186
1187 static u32 rt6_exception_hash(const struct in6_addr *dst,
1188                               const struct in6_addr *src)
1189 {
1190         static u32 seed __read_mostly;
1191         u32 val;
1192
1193         net_get_random_once(&seed, sizeof(seed));
1194         val = jhash(dst, sizeof(*dst), seed);
1195
1196 #ifdef CONFIG_IPV6_SUBTREES
1197         if (src)
1198                 val = jhash(src, sizeof(*src), val);
1199 #endif
1200         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1201 }
1202
1203 /* Helper function to find the cached rt in the hash table
1204  * and update bucket pointer to point to the bucket for this
1205  * (daddr, saddr) pair
1206  * Caller must hold rt6_exception_lock
1207  */
1208 static struct rt6_exception *
1209 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1210                               const struct in6_addr *daddr,
1211                               const struct in6_addr *saddr)
1212 {
1213         struct rt6_exception *rt6_ex;
1214         u32 hval;
1215
1216         if (!(*bucket) || !daddr)
1217                 return NULL;
1218
1219         hval = rt6_exception_hash(daddr, saddr);
1220         *bucket += hval;
1221
1222         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1223                 struct rt6_info *rt6 = rt6_ex->rt6i;
1224                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1225
1226 #ifdef CONFIG_IPV6_SUBTREES
1227                 if (matched && saddr)
1228                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1229 #endif
1230                 if (matched)
1231                         return rt6_ex;
1232         }
1233         return NULL;
1234 }
1235
1236 /* Helper function to find the cached rt in the hash table
1237  * and update bucket pointer to point to the bucket for this
1238  * (daddr, saddr) pair
1239  * Caller must hold rcu_read_lock()
1240  */
1241 static struct rt6_exception *
1242 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1243                          const struct in6_addr *daddr,
1244                          const struct in6_addr *saddr)
1245 {
1246         struct rt6_exception *rt6_ex;
1247         u32 hval;
1248
1249         WARN_ON_ONCE(!rcu_read_lock_held());
1250
1251         if (!(*bucket) || !daddr)
1252                 return NULL;
1253
1254         hval = rt6_exception_hash(daddr, saddr);
1255         *bucket += hval;
1256
1257         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1258                 struct rt6_info *rt6 = rt6_ex->rt6i;
1259                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1260
1261 #ifdef CONFIG_IPV6_SUBTREES
1262                 if (matched && saddr)
1263                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1264 #endif
1265                 if (matched)
1266                         return rt6_ex;
1267         }
1268         return NULL;
1269 }
1270
1271 static int rt6_insert_exception(struct rt6_info *nrt,
1272                                 struct rt6_info *ort)
1273 {
1274         struct net *net = dev_net(ort->dst.dev);
1275         struct rt6_exception_bucket *bucket;
1276         struct in6_addr *src_key = NULL;
1277         struct rt6_exception *rt6_ex;
1278         int err = 0;
1279
1280         /* ort can't be a cache or pcpu route */
1281         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1282                 ort = ort->from;
1283         WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1284
1285         spin_lock_bh(&rt6_exception_lock);
1286
1287         if (ort->exception_bucket_flushed) {
1288                 err = -EINVAL;
1289                 goto out;
1290         }
1291
1292         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1293                                         lockdep_is_held(&rt6_exception_lock));
1294         if (!bucket) {
1295                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1296                                  GFP_ATOMIC);
1297                 if (!bucket) {
1298                         err = -ENOMEM;
1299                         goto out;
1300                 }
1301                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1302         }
1303
1304 #ifdef CONFIG_IPV6_SUBTREES
1305         /* rt6i_src.plen != 0 indicates ort is in subtree
1306          * and exception table is indexed by a hash of
1307          * both rt6i_dst and rt6i_src.
1308          * Otherwise, the exception table is indexed by
1309          * a hash of only rt6i_dst.
1310          */
1311         if (ort->rt6i_src.plen)
1312                 src_key = &nrt->rt6i_src.addr;
1313 #endif
1314
1315         /* Update rt6i_prefsrc as it could be changed
1316          * in rt6_remove_prefsrc()
1317          */
1318         nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1319         /* rt6_mtu_change() might lower mtu on ort.
1320          * Only insert this exception route if its mtu
1321          * is less than ort's mtu value.
1322          */
1323         if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1324                 err = -EINVAL;
1325                 goto out;
1326         }
1327
1328         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1329                                                src_key);
1330         if (rt6_ex)
1331                 rt6_remove_exception(bucket, rt6_ex);
1332
1333         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1334         if (!rt6_ex) {
1335                 err = -ENOMEM;
1336                 goto out;
1337         }
1338         rt6_ex->rt6i = nrt;
1339         rt6_ex->stamp = jiffies;
1340         atomic_inc(&nrt->rt6i_ref);
1341         nrt->rt6i_node = ort->rt6i_node;
1342         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1343         bucket->depth++;
1344         net->ipv6.rt6_stats->fib_rt_cache++;
1345
1346         if (bucket->depth > FIB6_MAX_DEPTH)
1347                 rt6_exception_remove_oldest(bucket);
1348
1349 out:
1350         spin_unlock_bh(&rt6_exception_lock);
1351
1352         /* Update fn->fn_sernum to invalidate all cached dst */
1353         if (!err) {
1354                 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1355                 fib6_update_sernum(ort);
1356                 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1357                 fib6_force_start_gc(net);
1358         }
1359
1360         return err;
1361 }
1362
1363 void rt6_flush_exceptions(struct rt6_info *rt)
1364 {
1365         struct rt6_exception_bucket *bucket;
1366         struct rt6_exception *rt6_ex;
1367         struct hlist_node *tmp;
1368         int i;
1369
1370         spin_lock_bh(&rt6_exception_lock);
1371         /* Prevent rt6_insert_exception() to recreate the bucket list */
1372         rt->exception_bucket_flushed = 1;
1373
1374         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1375                                     lockdep_is_held(&rt6_exception_lock));
1376         if (!bucket)
1377                 goto out;
1378
1379         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1380                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1381                         rt6_remove_exception(bucket, rt6_ex);
1382                 WARN_ON_ONCE(bucket->depth);
1383                 bucket++;
1384         }
1385
1386 out:
1387         spin_unlock_bh(&rt6_exception_lock);
1388 }
1389
1390 /* Find cached rt in the hash table inside passed in rt
1391  * Caller has to hold rcu_read_lock()
1392  */
1393 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1394                                            struct in6_addr *daddr,
1395                                            struct in6_addr *saddr)
1396 {
1397         struct rt6_exception_bucket *bucket;
1398         struct in6_addr *src_key = NULL;
1399         struct rt6_exception *rt6_ex;
1400         struct rt6_info *res = NULL;
1401
1402         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1403
1404 #ifdef CONFIG_IPV6_SUBTREES
1405         /* rt6i_src.plen != 0 indicates rt is in subtree
1406          * and exception table is indexed by a hash of
1407          * both rt6i_dst and rt6i_src.
1408          * Otherwise, the exception table is indexed by
1409          * a hash of only rt6i_dst.
1410          */
1411         if (rt->rt6i_src.plen)
1412                 src_key = saddr;
1413 #endif
1414         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1415
1416         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1417                 res = rt6_ex->rt6i;
1418
1419         return res;
1420 }
1421
1422 /* Remove the passed in cached rt from the hash table that contains it */
1423 int rt6_remove_exception_rt(struct rt6_info *rt)
1424 {
1425         struct rt6_exception_bucket *bucket;
1426         struct rt6_info *from = rt->from;
1427         struct in6_addr *src_key = NULL;
1428         struct rt6_exception *rt6_ex;
1429         int err;
1430
1431         if (!from ||
1432             !(rt->rt6i_flags & RTF_CACHE))
1433                 return -EINVAL;
1434
1435         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1436                 return -ENOENT;
1437
1438         spin_lock_bh(&rt6_exception_lock);
1439         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1440                                     lockdep_is_held(&rt6_exception_lock));
1441 #ifdef CONFIG_IPV6_SUBTREES
1442         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1443          * and exception table is indexed by a hash of
1444          * both rt6i_dst and rt6i_src.
1445          * Otherwise, the exception table is indexed by
1446          * a hash of only rt6i_dst.
1447          */
1448         if (from->rt6i_src.plen)
1449                 src_key = &rt->rt6i_src.addr;
1450 #endif
1451         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1452                                                &rt->rt6i_dst.addr,
1453                                                src_key);
1454         if (rt6_ex) {
1455                 rt6_remove_exception(bucket, rt6_ex);
1456                 err = 0;
1457         } else {
1458                 err = -ENOENT;
1459         }
1460
1461         spin_unlock_bh(&rt6_exception_lock);
1462         return err;
1463 }
1464
1465 /* Find rt6_ex which contains the passed in rt cache and
1466  * refresh its stamp
1467  */
1468 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1469 {
1470         struct rt6_exception_bucket *bucket;
1471         struct rt6_info *from = rt->from;
1472         struct in6_addr *src_key = NULL;
1473         struct rt6_exception *rt6_ex;
1474
1475         if (!from ||
1476             !(rt->rt6i_flags & RTF_CACHE))
1477                 return;
1478
1479         rcu_read_lock();
1480         bucket = rcu_dereference(from->rt6i_exception_bucket);
1481
1482 #ifdef CONFIG_IPV6_SUBTREES
1483         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1484          * and exception table is indexed by a hash of
1485          * both rt6i_dst and rt6i_src.
1486          * Otherwise, the exception table is indexed by
1487          * a hash of only rt6i_dst.
1488          */
1489         if (from->rt6i_src.plen)
1490                 src_key = &rt->rt6i_src.addr;
1491 #endif
1492         rt6_ex = __rt6_find_exception_rcu(&bucket,
1493                                           &rt->rt6i_dst.addr,
1494                                           src_key);
1495         if (rt6_ex)
1496                 rt6_ex->stamp = jiffies;
1497
1498         rcu_read_unlock();
1499 }
1500
1501 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1502 {
1503         struct rt6_exception_bucket *bucket;
1504         struct rt6_exception *rt6_ex;
1505         int i;
1506
1507         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1508                                         lockdep_is_held(&rt6_exception_lock));
1509
1510         if (bucket) {
1511                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1513                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1514                         }
1515                         bucket++;
1516                 }
1517         }
1518 }
1519
1520 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1521                                          struct rt6_info *rt, int mtu)
1522 {
1523         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1524          * lowest MTU in the path: always allow updating the route PMTU to
1525          * reflect PMTU decreases.
1526          *
1527          * If the new MTU is higher, and the route PMTU is equal to the local
1528          * MTU, this means the old MTU is the lowest in the path, so allow
1529          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1530          * handle this.
1531          */
1532
1533         if (dst_mtu(&rt->dst) >= mtu)
1534                 return true;
1535
1536         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1537                 return true;
1538
1539         return false;
1540 }
1541
1542 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1543                                        struct rt6_info *rt, int mtu)
1544 {
1545         struct rt6_exception_bucket *bucket;
1546         struct rt6_exception *rt6_ex;
1547         int i;
1548
1549         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550                                         lockdep_is_held(&rt6_exception_lock));
1551
1552         if (!bucket)
1553                 return;
1554
1555         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1556                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1557                         struct rt6_info *entry = rt6_ex->rt6i;
1558
1559                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1560                          * route), the metrics of its rt->dst.from have already
1561                          * been updated.
1562                          */
1563                         if (entry->rt6i_pmtu &&
1564                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1565                                 entry->rt6i_pmtu = mtu;
1566                 }
1567                 bucket++;
1568         }
1569 }
1570
1571 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1572
1573 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1574                                         struct in6_addr *gateway)
1575 {
1576         struct rt6_exception_bucket *bucket;
1577         struct rt6_exception *rt6_ex;
1578         struct hlist_node *tmp;
1579         int i;
1580
1581         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1582                 return;
1583
1584         spin_lock_bh(&rt6_exception_lock);
1585         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1586                                      lockdep_is_held(&rt6_exception_lock));
1587
1588         if (bucket) {
1589                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1590                         hlist_for_each_entry_safe(rt6_ex, tmp,
1591                                                   &bucket->chain, hlist) {
1592                                 struct rt6_info *entry = rt6_ex->rt6i;
1593
1594                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1595                                     RTF_CACHE_GATEWAY &&
1596                                     ipv6_addr_equal(gateway,
1597                                                     &entry->rt6i_gateway)) {
1598                                         rt6_remove_exception(bucket, rt6_ex);
1599                                 }
1600                         }
1601                         bucket++;
1602                 }
1603         }
1604
1605         spin_unlock_bh(&rt6_exception_lock);
1606 }
1607
1608 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1609                                       struct rt6_exception *rt6_ex,
1610                                       struct fib6_gc_args *gc_args,
1611                                       unsigned long now)
1612 {
1613         struct rt6_info *rt = rt6_ex->rt6i;
1614
1615         /* we are pruning and obsoleting aged-out and non gateway exceptions
1616          * even if others have still references to them, so that on next
1617          * dst_check() such references can be dropped.
1618          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1619          * expired, independently from their aging, as per RFC 8201 section 4
1620          */
1621         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1622                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1623                         RT6_TRACE("aging clone %p\n", rt);
1624                         rt6_remove_exception(bucket, rt6_ex);
1625                         return;
1626                 }
1627         } else if (time_after(jiffies, rt->dst.expires)) {
1628                 RT6_TRACE("purging expired route %p\n", rt);
1629                 rt6_remove_exception(bucket, rt6_ex);
1630                 return;
1631         }
1632
1633         if (rt->rt6i_flags & RTF_GATEWAY) {
1634                 struct neighbour *neigh;
1635                 __u8 neigh_flags = 0;
1636
1637                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1638                 if (neigh)
1639                         neigh_flags = neigh->flags;
1640
1641                 if (!(neigh_flags & NTF_ROUTER)) {
1642                         RT6_TRACE("purging route %p via non-router but gateway\n",
1643                                   rt);
1644                         rt6_remove_exception(bucket, rt6_ex);
1645                         return;
1646                 }
1647         }
1648
1649         gc_args->more++;
1650 }
1651
1652 void rt6_age_exceptions(struct rt6_info *rt,
1653                         struct fib6_gc_args *gc_args,
1654                         unsigned long now)
1655 {
1656         struct rt6_exception_bucket *bucket;
1657         struct rt6_exception *rt6_ex;
1658         struct hlist_node *tmp;
1659         int i;
1660
1661         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1662                 return;
1663
1664         rcu_read_lock_bh();
1665         spin_lock(&rt6_exception_lock);
1666         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1667                                     lockdep_is_held(&rt6_exception_lock));
1668
1669         if (bucket) {
1670                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1671                         hlist_for_each_entry_safe(rt6_ex, tmp,
1672                                                   &bucket->chain, hlist) {
1673                                 rt6_age_examine_exception(bucket, rt6_ex,
1674                                                           gc_args, now);
1675                         }
1676                         bucket++;
1677                 }
1678         }
1679         spin_unlock(&rt6_exception_lock);
1680         rcu_read_unlock_bh();
1681 }
1682
1683 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1684                                int oif, struct flowi6 *fl6,
1685                                const struct sk_buff *skb, int flags)
1686 {
1687         struct fib6_node *fn, *saved_fn;
1688         struct rt6_info *rt, *rt_cache;
1689         int strict = 0;
1690
1691         strict |= flags & RT6_LOOKUP_F_IFACE;
1692         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1693         if (net->ipv6.devconf_all->forwarding == 0)
1694                 strict |= RT6_LOOKUP_F_REACHABLE;
1695
1696         rcu_read_lock();
1697
1698         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1699         saved_fn = fn;
1700
1701         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1702                 oif = 0;
1703
1704 redo_rt6_select:
1705         rt = rt6_select(net, fn, oif, strict);
1706         if (rt->rt6i_nsiblings)
1707                 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
1708         if (rt == net->ipv6.ip6_null_entry) {
1709                 fn = fib6_backtrack(fn, &fl6->saddr);
1710                 if (fn)
1711                         goto redo_rt6_select;
1712                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1713                         /* also consider unreachable route */
1714                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1715                         fn = saved_fn;
1716                         goto redo_rt6_select;
1717                 }
1718         }
1719
1720         /*Search through exception table */
1721         rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1722         if (rt_cache)
1723                 rt = rt_cache;
1724
1725         if (rt == net->ipv6.ip6_null_entry) {
1726                 rcu_read_unlock();
1727                 dst_hold(&rt->dst);
1728                 trace_fib6_table_lookup(net, rt, table, fl6);
1729                 return rt;
1730         } else if (rt->rt6i_flags & RTF_CACHE) {
1731                 if (ip6_hold_safe(net, &rt, true)) {
1732                         dst_use_noref(&rt->dst, jiffies);
1733                         rt6_dst_from_metrics_check(rt);
1734                 }
1735                 rcu_read_unlock();
1736                 trace_fib6_table_lookup(net, rt, table, fl6);
1737                 return rt;
1738         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1739                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1740                 /* Create a RTF_CACHE clone which will not be
1741                  * owned by the fib6 tree.  It is for the special case where
1742                  * the daddr in the skb during the neighbor look-up is different
1743                  * from the fl6->daddr used to look-up route here.
1744                  */
1745
1746                 struct rt6_info *uncached_rt;
1747
1748                 if (ip6_hold_safe(net, &rt, true)) {
1749                         dst_use_noref(&rt->dst, jiffies);
1750                 } else {
1751                         rcu_read_unlock();
1752                         uncached_rt = rt;
1753                         goto uncached_rt_out;
1754                 }
1755                 rcu_read_unlock();
1756
1757                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1758                 dst_release(&rt->dst);
1759
1760                 if (uncached_rt) {
1761                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1762                          * No need for another dst_hold()
1763                          */
1764                         rt6_uncached_list_add(uncached_rt);
1765                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1766                 } else {
1767                         uncached_rt = net->ipv6.ip6_null_entry;
1768                         dst_hold(&uncached_rt->dst);
1769                 }
1770
1771 uncached_rt_out:
1772                 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1773                 return uncached_rt;
1774
1775         } else {
1776                 /* Get a percpu copy */
1777
1778                 struct rt6_info *pcpu_rt;
1779
1780                 dst_use_noref(&rt->dst, jiffies);
1781                 local_bh_disable();
1782                 pcpu_rt = rt6_get_pcpu_route(rt);
1783
1784                 if (!pcpu_rt) {
1785                         /* atomic_inc_not_zero() is needed when using rcu */
1786                         if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1787                                 /* No dst_hold() on rt is needed because grabbing
1788                                  * rt->rt6i_ref makes sure rt can't be released.
1789                                  */
1790                                 pcpu_rt = rt6_make_pcpu_route(rt);
1791                                 rt6_release(rt);
1792                         } else {
1793                                 /* rt is already removed from tree */
1794                                 pcpu_rt = net->ipv6.ip6_null_entry;
1795                                 dst_hold(&pcpu_rt->dst);
1796                         }
1797                 }
1798                 local_bh_enable();
1799                 rcu_read_unlock();
1800                 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1801                 return pcpu_rt;
1802         }
1803 }
1804 EXPORT_SYMBOL_GPL(ip6_pol_route);
1805
1806 static struct rt6_info *ip6_pol_route_input(struct net *net,
1807                                             struct fib6_table *table,
1808                                             struct flowi6 *fl6,
1809                                             const struct sk_buff *skb,
1810                                             int flags)
1811 {
1812         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1813 }
1814
1815 struct dst_entry *ip6_route_input_lookup(struct net *net,
1816                                          struct net_device *dev,
1817                                          struct flowi6 *fl6,
1818                                          const struct sk_buff *skb,
1819                                          int flags)
1820 {
1821         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1822                 flags |= RT6_LOOKUP_F_IFACE;
1823
1824         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1825 }
1826 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1827
1828 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1829                                   struct flow_keys *keys,
1830                                   struct flow_keys *flkeys)
1831 {
1832         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1833         const struct ipv6hdr *key_iph = outer_iph;
1834         struct flow_keys *_flkeys = flkeys;
1835         const struct ipv6hdr *inner_iph;
1836         const struct icmp6hdr *icmph;
1837         struct ipv6hdr _inner_iph;
1838
1839         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1840                 goto out;
1841
1842         icmph = icmp6_hdr(skb);
1843         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1844             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1845             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1846             icmph->icmp6_type != ICMPV6_PARAMPROB)
1847                 goto out;
1848
1849         inner_iph = skb_header_pointer(skb,
1850                                        skb_transport_offset(skb) + sizeof(*icmph),
1851                                        sizeof(_inner_iph), &_inner_iph);
1852         if (!inner_iph)
1853                 goto out;
1854
1855         key_iph = inner_iph;
1856         _flkeys = NULL;
1857 out:
1858         if (_flkeys) {
1859                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1860                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1861                 keys->tags.flow_label = _flkeys->tags.flow_label;
1862                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1863         } else {
1864                 keys->addrs.v6addrs.src = key_iph->saddr;
1865                 keys->addrs.v6addrs.dst = key_iph->daddr;
1866                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1867                 keys->basic.ip_proto = key_iph->nexthdr;
1868         }
1869 }
1870
1871 /* if skb is set it will be used and fl6 can be NULL */
1872 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1873                        const struct sk_buff *skb, struct flow_keys *flkeys)
1874 {
1875         struct flow_keys hash_keys;
1876         u32 mhash;
1877
1878         switch (ip6_multipath_hash_policy(net)) {
1879         case 0:
1880                 memset(&hash_keys, 0, sizeof(hash_keys));
1881                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1882                 if (skb) {
1883                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1884                 } else {
1885                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1886                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1887                         hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1888                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1889                 }
1890                 break;
1891         case 1:
1892                 if (skb) {
1893                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1894                         struct flow_keys keys;
1895
1896                         /* short-circuit if we already have L4 hash present */
1897                         if (skb->l4_hash)
1898                                 return skb_get_hash_raw(skb) >> 1;
1899
1900                         memset(&hash_keys, 0, sizeof(hash_keys));
1901
1902                         if (!flkeys) {
1903                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1904                                 flkeys = &keys;
1905                         }
1906                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1907                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1908                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1909                         hash_keys.ports.src = flkeys->ports.src;
1910                         hash_keys.ports.dst = flkeys->ports.dst;
1911                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1912                 } else {
1913                         memset(&hash_keys, 0, sizeof(hash_keys));
1914                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1915                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1916                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1917                         hash_keys.ports.src = fl6->fl6_sport;
1918                         hash_keys.ports.dst = fl6->fl6_dport;
1919                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1920                 }
1921                 break;
1922         }
1923         mhash = flow_hash_from_keys(&hash_keys);
1924
1925         return mhash >> 1;
1926 }
1927
1928 void ip6_route_input(struct sk_buff *skb)
1929 {
1930         const struct ipv6hdr *iph = ipv6_hdr(skb);
1931         struct net *net = dev_net(skb->dev);
1932         int flags = RT6_LOOKUP_F_HAS_SADDR;
1933         struct ip_tunnel_info *tun_info;
1934         struct flowi6 fl6 = {
1935                 .flowi6_iif = skb->dev->ifindex,
1936                 .daddr = iph->daddr,
1937                 .saddr = iph->saddr,
1938                 .flowlabel = ip6_flowinfo(iph),
1939                 .flowi6_mark = skb->mark,
1940                 .flowi6_proto = iph->nexthdr,
1941         };
1942         struct flow_keys *flkeys = NULL, _flkeys;
1943
1944         tun_info = skb_tunnel_info(skb);
1945         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1946                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1947
1948         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1949                 flkeys = &_flkeys;
1950
1951         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1952                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1953         skb_dst_drop(skb);
1954         skb_dst_set(skb,
1955                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1956 }
1957
1958 static struct rt6_info *ip6_pol_route_output(struct net *net,
1959                                              struct fib6_table *table,
1960                                              struct flowi6 *fl6,
1961                                              const struct sk_buff *skb,
1962                                              int flags)
1963 {
1964         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1965 }
1966
1967 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968                                          struct flowi6 *fl6, int flags)
1969 {
1970         bool any_src;
1971
1972         if (rt6_need_strict(&fl6->daddr)) {
1973                 struct dst_entry *dst;
1974
1975                 dst = l3mdev_link_scope_lookup(net, fl6);
1976                 if (dst)
1977                         return dst;
1978         }
1979
1980         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1981
1982         any_src = ipv6_addr_any(&fl6->saddr);
1983         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1984             (fl6->flowi6_oif && any_src))
1985                 flags |= RT6_LOOKUP_F_IFACE;
1986
1987         if (!any_src)
1988                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1989         else if (sk)
1990                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1991
1992         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1993 }
1994 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1995
1996 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1997 {
1998         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1999         struct net_device *loopback_dev = net->loopback_dev;
2000         struct dst_entry *new = NULL;
2001
2002         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2003                        DST_OBSOLETE_DEAD, 0);
2004         if (rt) {
2005                 rt6_info_init(rt);
2006                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2007
2008                 new = &rt->dst;
2009                 new->__use = 1;
2010                 new->input = dst_discard;
2011                 new->output = dst_discard_out;
2012
2013                 dst_copy_metrics(new, &ort->dst);
2014
2015                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2016                 rt->rt6i_gateway = ort->rt6i_gateway;
2017                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2018                 rt->rt6i_metric = 0;
2019
2020                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2021 #ifdef CONFIG_IPV6_SUBTREES
2022                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2023 #endif
2024         }
2025
2026         dst_release(dst_orig);
2027         return new ? new : ERR_PTR(-ENOMEM);
2028 }
2029
2030 /*
2031  *      Destination cache support functions
2032  */
2033
2034 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2035 {
2036         if (rt->from &&
2037             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2039 }
2040
2041 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2042 {
2043         u32 rt_cookie = 0;
2044
2045         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2046                 return NULL;
2047
2048         if (rt6_check_expired(rt))
2049                 return NULL;
2050
2051         return &rt->dst;
2052 }
2053
2054 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2055 {
2056         if (!__rt6_check_expired(rt) &&
2057             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2058             rt6_check(rt->from, cookie))
2059                 return &rt->dst;
2060         else
2061                 return NULL;
2062 }
2063
2064 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066         struct rt6_info *rt;
2067
2068         rt = (struct rt6_info *) dst;
2069
2070         /* All IPV6 dsts are created with ->obsolete set to the value
2071          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2072          * into this function always.
2073          */
2074
2075         rt6_dst_from_metrics_check(rt);
2076
2077         if (rt->rt6i_flags & RTF_PCPU ||
2078             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2079                 return rt6_dst_from_check(rt, cookie);
2080         else
2081                 return rt6_check(rt, cookie);
2082 }
2083
2084 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2085 {
2086         struct rt6_info *rt = (struct rt6_info *) dst;
2087
2088         if (rt) {
2089                 if (rt->rt6i_flags & RTF_CACHE) {
2090                         if (rt6_check_expired(rt)) {
2091                                 ip6_del_rt(rt);
2092                                 dst = NULL;
2093                         }
2094                 } else {
2095                         dst_release(dst);
2096                         dst = NULL;
2097                 }
2098         }
2099         return dst;
2100 }
2101
2102 static void ip6_link_failure(struct sk_buff *skb)
2103 {
2104         struct rt6_info *rt;
2105
2106         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2107
2108         rt = (struct rt6_info *) skb_dst(skb);
2109         if (rt) {
2110                 if (rt->rt6i_flags & RTF_CACHE) {
2111                         if (dst_hold_safe(&rt->dst))
2112                                 ip6_del_rt(rt);
2113                 } else {
2114                         struct fib6_node *fn;
2115
2116                         rcu_read_lock();
2117                         fn = rcu_dereference(rt->rt6i_node);
2118                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2119                                 fn->fn_sernum = -1;
2120                         rcu_read_unlock();
2121                 }
2122         }
2123 }
2124
2125 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2126 {
2127         struct net *net = dev_net(rt->dst.dev);
2128
2129         rt->rt6i_flags |= RTF_MODIFIED;
2130         rt->rt6i_pmtu = mtu;
2131         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2132 }
2133
2134 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2135 {
2136         return !(rt->rt6i_flags & RTF_CACHE) &&
2137                 (rt->rt6i_flags & RTF_PCPU ||
2138                  rcu_access_pointer(rt->rt6i_node));
2139 }
2140
2141 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2142                                  const struct ipv6hdr *iph, u32 mtu)
2143 {
2144         const struct in6_addr *daddr, *saddr;
2145         struct rt6_info *rt6 = (struct rt6_info *)dst;
2146
2147         if (rt6->rt6i_flags & RTF_LOCAL)
2148                 return;
2149
2150         if (dst_metric_locked(dst, RTAX_MTU))
2151                 return;
2152
2153         if (iph) {
2154                 daddr = &iph->daddr;
2155                 saddr = &iph->saddr;
2156         } else if (sk) {
2157                 daddr = &sk->sk_v6_daddr;
2158                 saddr = &inet6_sk(sk)->saddr;
2159         } else {
2160                 daddr = NULL;
2161                 saddr = NULL;
2162         }
2163         dst_confirm_neigh(dst, daddr);
2164         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2165         if (mtu >= dst_mtu(dst))
2166                 return;
2167
2168         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2169                 rt6_do_update_pmtu(rt6, mtu);
2170                 /* update rt6_ex->stamp for cache */
2171                 if (rt6->rt6i_flags & RTF_CACHE)
2172                         rt6_update_exception_stamp_rt(rt6);
2173         } else if (daddr) {
2174                 struct rt6_info *nrt6;
2175
2176                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2177                 if (nrt6) {
2178                         rt6_do_update_pmtu(nrt6, mtu);
2179                         if (rt6_insert_exception(nrt6, rt6))
2180                                 dst_release_immediate(&nrt6->dst);
2181                 }
2182         }
2183 }
2184
2185 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186                                struct sk_buff *skb, u32 mtu)
2187 {
2188         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2189 }
2190
2191 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2192                      int oif, u32 mark, kuid_t uid)
2193 {
2194         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195         struct dst_entry *dst;
2196         struct flowi6 fl6;
2197
2198         memset(&fl6, 0, sizeof(fl6));
2199         fl6.flowi6_oif = oif;
2200         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2201         fl6.daddr = iph->daddr;
2202         fl6.saddr = iph->saddr;
2203         fl6.flowlabel = ip6_flowinfo(iph);
2204         fl6.flowi6_uid = uid;
2205
2206         dst = ip6_route_output(net, NULL, &fl6);
2207         if (!dst->error)
2208                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2209         dst_release(dst);
2210 }
2211 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2212
2213 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2214 {
2215         struct dst_entry *dst;
2216
2217         ip6_update_pmtu(skb, sock_net(sk), mtu,
2218                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2219
2220         dst = __sk_dst_get(sk);
2221         if (!dst || !dst->obsolete ||
2222             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2223                 return;
2224
2225         bh_lock_sock(sk);
2226         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2227                 ip6_datagram_dst_update(sk, false);
2228         bh_unlock_sock(sk);
2229 }
2230 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2231
2232 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2233                            const struct flowi6 *fl6)
2234 {
2235 #ifdef CONFIG_IPV6_SUBTREES
2236         struct ipv6_pinfo *np = inet6_sk(sk);
2237 #endif
2238
2239         ip6_dst_store(sk, dst,
2240                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2241                       &sk->sk_v6_daddr : NULL,
2242 #ifdef CONFIG_IPV6_SUBTREES
2243                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2244                       &np->saddr :
2245 #endif
2246                       NULL);
2247 }
2248
2249 /* Handle redirects */
2250 struct ip6rd_flowi {
2251         struct flowi6 fl6;
2252         struct in6_addr gateway;
2253 };
2254
2255 static struct rt6_info *__ip6_route_redirect(struct net *net,
2256                                              struct fib6_table *table,
2257                                              struct flowi6 *fl6,
2258                                              const struct sk_buff *skb,
2259                                              int flags)
2260 {
2261         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2262         struct rt6_info *rt, *rt_cache;
2263         struct fib6_node *fn;
2264
2265         /* Get the "current" route for this destination and
2266          * check if the redirect has come from appropriate router.
2267          *
2268          * RFC 4861 specifies that redirects should only be
2269          * accepted if they come from the nexthop to the target.
2270          * Due to the way the routes are chosen, this notion
2271          * is a bit fuzzy and one might need to check all possible
2272          * routes.
2273          */
2274
2275         rcu_read_lock();
2276         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2277 restart:
2278         for_each_fib6_node_rt_rcu(fn) {
2279                 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2280                         continue;
2281                 if (rt6_check_expired(rt))
2282                         continue;
2283                 if (rt->dst.error)
2284                         break;
2285                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2286                         continue;
2287                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2288                         continue;
2289                 /* rt_cache's gateway might be different from its 'parent'
2290                  * in the case of an ip redirect.
2291                  * So we keep searching in the exception table if the gateway
2292                  * is different.
2293                  */
2294                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2295                         rt_cache = rt6_find_cached_rt(rt,
2296                                                       &fl6->daddr,
2297                                                       &fl6->saddr);
2298                         if (rt_cache &&
2299                             ipv6_addr_equal(&rdfl->gateway,
2300                                             &rt_cache->rt6i_gateway)) {
2301                                 rt = rt_cache;
2302                                 break;
2303                         }
2304                         continue;
2305                 }
2306                 break;
2307         }
2308
2309         if (!rt)
2310                 rt = net->ipv6.ip6_null_entry;
2311         else if (rt->dst.error) {
2312                 rt = net->ipv6.ip6_null_entry;
2313                 goto out;
2314         }
2315
2316         if (rt == net->ipv6.ip6_null_entry) {
2317                 fn = fib6_backtrack(fn, &fl6->saddr);
2318                 if (fn)
2319                         goto restart;
2320         }
2321
2322 out:
2323         ip6_hold_safe(net, &rt, true);
2324
2325         rcu_read_unlock();
2326
2327         trace_fib6_table_lookup(net, rt, table, fl6);
2328         return rt;
2329 };
2330
2331 static struct dst_entry *ip6_route_redirect(struct net *net,
2332                                             const struct flowi6 *fl6,
2333                                             const struct sk_buff *skb,
2334                                             const struct in6_addr *gateway)
2335 {
2336         int flags = RT6_LOOKUP_F_HAS_SADDR;
2337         struct ip6rd_flowi rdfl;
2338
2339         rdfl.fl6 = *fl6;
2340         rdfl.gateway = *gateway;
2341
2342         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2343                                 flags, __ip6_route_redirect);
2344 }
2345
2346 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2347                   kuid_t uid)
2348 {
2349         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2350         struct dst_entry *dst;
2351         struct flowi6 fl6;
2352
2353         memset(&fl6, 0, sizeof(fl6));
2354         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2355         fl6.flowi6_oif = oif;
2356         fl6.flowi6_mark = mark;
2357         fl6.daddr = iph->daddr;
2358         fl6.saddr = iph->saddr;
2359         fl6.flowlabel = ip6_flowinfo(iph);
2360         fl6.flowi6_uid = uid;
2361
2362         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2363         rt6_do_redirect(dst, NULL, skb);
2364         dst_release(dst);
2365 }
2366 EXPORT_SYMBOL_GPL(ip6_redirect);
2367
2368 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2369                             u32 mark)
2370 {
2371         const struct ipv6hdr *iph = ipv6_hdr(skb);
2372         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2373         struct dst_entry *dst;
2374         struct flowi6 fl6;
2375
2376         memset(&fl6, 0, sizeof(fl6));
2377         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2378         fl6.flowi6_oif = oif;
2379         fl6.flowi6_mark = mark;
2380         fl6.daddr = msg->dest;
2381         fl6.saddr = iph->daddr;
2382         fl6.flowi6_uid = sock_net_uid(net, NULL);
2383
2384         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2385         rt6_do_redirect(dst, NULL, skb);
2386         dst_release(dst);
2387 }
2388
2389 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2390 {
2391         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2392                      sk->sk_uid);
2393 }
2394 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2395
2396 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2397 {
2398         struct net_device *dev = dst->dev;
2399         unsigned int mtu = dst_mtu(dst);
2400         struct net *net = dev_net(dev);
2401
2402         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2403
2404         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2405                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2406
2407         /*
2408          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2409          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2410          * IPV6_MAXPLEN is also valid and means: "any MSS,
2411          * rely only on pmtu discovery"
2412          */
2413         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2414                 mtu = IPV6_MAXPLEN;
2415         return mtu;
2416 }
2417
2418 static unsigned int ip6_mtu(const struct dst_entry *dst)
2419 {
2420         const struct rt6_info *rt = (const struct rt6_info *)dst;
2421         unsigned int mtu = rt->rt6i_pmtu;
2422         struct inet6_dev *idev;
2423
2424         if (mtu)
2425                 goto out;
2426
2427         mtu = dst_metric_raw(dst, RTAX_MTU);
2428         if (mtu)
2429                 goto out;
2430
2431         mtu = IPV6_MIN_MTU;
2432
2433         rcu_read_lock();
2434         idev = __in6_dev_get(dst->dev);
2435         if (idev)
2436                 mtu = idev->cnf.mtu6;
2437         rcu_read_unlock();
2438
2439 out:
2440         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2441
2442         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2443 }
2444
2445 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2446                                   struct flowi6 *fl6)
2447 {
2448         struct dst_entry *dst;
2449         struct rt6_info *rt;
2450         struct inet6_dev *idev = in6_dev_get(dev);
2451         struct net *net = dev_net(dev);
2452
2453         if (unlikely(!idev))
2454                 return ERR_PTR(-ENODEV);
2455
2456         rt = ip6_dst_alloc(net, dev, 0);
2457         if (unlikely(!rt)) {
2458                 in6_dev_put(idev);
2459                 dst = ERR_PTR(-ENOMEM);
2460                 goto out;
2461         }
2462
2463         rt->dst.flags |= DST_HOST;
2464         rt->dst.input = ip6_input;
2465         rt->dst.output  = ip6_output;
2466         rt->rt6i_gateway  = fl6->daddr;
2467         rt->rt6i_dst.addr = fl6->daddr;
2468         rt->rt6i_dst.plen = 128;
2469         rt->rt6i_idev     = idev;
2470         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2471
2472         /* Add this dst into uncached_list so that rt6_disable_ip() can
2473          * do proper release of the net_device
2474          */
2475         rt6_uncached_list_add(rt);
2476         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2477
2478         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2479
2480 out:
2481         return dst;
2482 }
2483
2484 static int ip6_dst_gc(struct dst_ops *ops)
2485 {
2486         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2487         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2488         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2489         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2490         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2491         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2492         int entries;
2493
2494         entries = dst_entries_get_fast(ops);
2495         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2496             entries <= rt_max_size)
2497                 goto out;
2498
2499         net->ipv6.ip6_rt_gc_expire++;
2500         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2501         entries = dst_entries_get_slow(ops);
2502         if (entries < ops->gc_thresh)
2503                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2504 out:
2505         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2506         return entries > rt_max_size;
2507 }
2508
2509 static int ip6_convert_metrics(struct mx6_config *mxc,
2510                                const struct fib6_config *cfg)
2511 {
2512         struct net *net = cfg->fc_nlinfo.nl_net;
2513         bool ecn_ca = false;
2514         struct nlattr *nla;
2515         int remaining;
2516         u32 *mp;
2517
2518         if (!cfg->fc_mx)
2519                 return 0;
2520
2521         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2522         if (unlikely(!mp))
2523                 return -ENOMEM;
2524
2525         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2526                 int type = nla_type(nla);
2527                 u32 val;
2528
2529                 if (!type)
2530                         continue;
2531                 if (unlikely(type > RTAX_MAX))
2532                         goto err;
2533
2534                 if (type == RTAX_CC_ALGO) {
2535                         char tmp[TCP_CA_NAME_MAX];
2536
2537                         nla_strlcpy(tmp, nla, sizeof(tmp));
2538                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2539                         if (val == TCP_CA_UNSPEC)
2540                                 goto err;
2541                 } else {
2542                         val = nla_get_u32(nla);
2543                 }
2544                 if (type == RTAX_HOPLIMIT && val > 255)
2545                         val = 255;
2546                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2547                         goto err;
2548
2549                 mp[type - 1] = val;
2550                 __set_bit(type - 1, mxc->mx_valid);
2551         }
2552
2553         if (ecn_ca) {
2554                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2555                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2556         }
2557
2558         mxc->mx = mp;
2559         return 0;
2560  err:
2561         kfree(mp);
2562         return -EINVAL;
2563 }
2564
2565 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2566                                             struct fib6_config *cfg,
2567                                             const struct in6_addr *gw_addr,
2568                                             u32 tbid, int flags)
2569 {
2570         struct flowi6 fl6 = {
2571                 .flowi6_oif = cfg->fc_ifindex,
2572                 .daddr = *gw_addr,
2573                 .saddr = cfg->fc_prefsrc,
2574         };
2575         struct fib6_table *table;
2576         struct rt6_info *rt;
2577
2578         table = fib6_get_table(net, tbid);
2579         if (!table)
2580                 return NULL;
2581
2582         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2583                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2584
2585         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2586         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2587
2588         /* if table lookup failed, fall back to full lookup */
2589         if (rt == net->ipv6.ip6_null_entry) {
2590                 ip6_rt_put(rt);
2591                 rt = NULL;
2592         }
2593
2594         return rt;
2595 }
2596
2597 static int ip6_route_check_nh_onlink(struct net *net,
2598                                      struct fib6_config *cfg,
2599                                      const struct net_device *dev,
2600                                      struct netlink_ext_ack *extack)
2601 {
2602         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2603         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2604         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2605         struct rt6_info *grt;
2606         int err;
2607
2608         err = 0;
2609         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2610         if (grt) {
2611                 if (!grt->dst.error &&
2612                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2613                         NL_SET_ERR_MSG(extack,
2614                                        "Nexthop has invalid gateway or device mismatch");
2615                         err = -EINVAL;
2616                 }
2617
2618                 ip6_rt_put(grt);
2619         }
2620
2621         return err;
2622 }
2623
2624 static int ip6_route_check_nh(struct net *net,
2625                               struct fib6_config *cfg,
2626                               struct net_device **_dev,
2627                               struct inet6_dev **idev)
2628 {
2629         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2630         struct net_device *dev = _dev ? *_dev : NULL;
2631         struct rt6_info *grt = NULL;
2632         int err = -EHOSTUNREACH;
2633
2634         if (cfg->fc_table) {
2635                 int flags = RT6_LOOKUP_F_IFACE;
2636
2637                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2638                                           cfg->fc_table, flags);
2639                 if (grt) {
2640                         if (grt->rt6i_flags & RTF_GATEWAY ||
2641                             (dev && dev != grt->dst.dev)) {
2642                                 ip6_rt_put(grt);
2643                                 grt = NULL;
2644                         }
2645                 }
2646         }
2647
2648         if (!grt)
2649                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2650
2651         if (!grt)
2652                 goto out;
2653
2654         if (dev) {
2655                 if (dev != grt->dst.dev) {
2656                         ip6_rt_put(grt);
2657                         goto out;
2658                 }
2659         } else {
2660                 *_dev = dev = grt->dst.dev;
2661                 *idev = grt->rt6i_idev;
2662                 dev_hold(dev);
2663                 in6_dev_hold(grt->rt6i_idev);
2664         }
2665
2666         if (!(grt->rt6i_flags & RTF_GATEWAY))
2667                 err = 0;
2668
2669         ip6_rt_put(grt);
2670
2671 out:
2672         return err;
2673 }
2674
2675 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2676                            struct net_device **_dev, struct inet6_dev **idev,
2677                            struct netlink_ext_ack *extack)
2678 {
2679         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2680         int gwa_type = ipv6_addr_type(gw_addr);
2681         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2682         const struct net_device *dev = *_dev;
2683         bool need_addr_check = !dev;
2684         int err = -EINVAL;
2685
2686         /* if gw_addr is local we will fail to detect this in case
2687          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2688          * will return already-added prefix route via interface that
2689          * prefix route was assigned to, which might be non-loopback.
2690          */
2691         if (dev &&
2692             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2693                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2694                 goto out;
2695         }
2696
2697         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2698                 /* IPv6 strictly inhibits using not link-local
2699                  * addresses as nexthop address.
2700                  * Otherwise, router will not able to send redirects.
2701                  * It is very good, but in some (rare!) circumstances
2702                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2703                  * some exceptions. --ANK
2704                  * We allow IPv4-mapped nexthops to support RFC4798-type
2705                  * addressing
2706                  */
2707                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2708                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2709                         goto out;
2710                 }
2711
2712                 if (cfg->fc_flags & RTNH_F_ONLINK)
2713                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2714                 else
2715                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2716
2717                 if (err)
2718                         goto out;
2719         }
2720
2721         /* reload in case device was changed */
2722         dev = *_dev;
2723
2724         err = -EINVAL;
2725         if (!dev) {
2726                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2727                 goto out;
2728         } else if (dev->flags & IFF_LOOPBACK) {
2729                 NL_SET_ERR_MSG(extack,
2730                                "Egress device can not be loopback device for this route");
2731                 goto out;
2732         }
2733
2734         /* if we did not check gw_addr above, do so now that the
2735          * egress device has been resolved.
2736          */
2737         if (need_addr_check &&
2738             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2739                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2740                 goto out;
2741         }
2742
2743         err = 0;
2744 out:
2745         return err;
2746 }
2747
2748 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2749                                               struct netlink_ext_ack *extack)
2750 {
2751         struct net *net = cfg->fc_nlinfo.nl_net;
2752         struct rt6_info *rt = NULL;
2753         struct net_device *dev = NULL;
2754         struct inet6_dev *idev = NULL;
2755         struct fib6_table *table;
2756         int addr_type;
2757         int err = -EINVAL;
2758
2759         /* RTF_PCPU is an internal flag; can not be set by userspace */
2760         if (cfg->fc_flags & RTF_PCPU) {
2761                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2762                 goto out;
2763         }
2764
2765         /* RTF_CACHE is an internal flag; can not be set by userspace */
2766         if (cfg->fc_flags & RTF_CACHE) {
2767                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2768                 goto out;
2769         }
2770
2771         if (cfg->fc_dst_len > 128) {
2772                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2773                 goto out;
2774         }
2775         if (cfg->fc_src_len > 128) {
2776                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2777                 goto out;
2778         }
2779 #ifndef CONFIG_IPV6_SUBTREES
2780         if (cfg->fc_src_len) {
2781                 NL_SET_ERR_MSG(extack,
2782                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2783                 goto out;
2784         }
2785 #endif
2786         if (cfg->fc_ifindex) {
2787                 err = -ENODEV;
2788                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2789                 if (!dev)
2790                         goto out;
2791                 idev = in6_dev_get(dev);
2792                 if (!idev)
2793                         goto out;
2794         }
2795
2796         if (cfg->fc_metric == 0)
2797                 cfg->fc_metric = IP6_RT_PRIO_USER;
2798
2799         if (cfg->fc_flags & RTNH_F_ONLINK) {
2800                 if (!dev) {
2801                         NL_SET_ERR_MSG(extack,
2802                                        "Nexthop device required for onlink");
2803                         err = -ENODEV;
2804                         goto out;
2805                 }
2806
2807                 if (!(dev->flags & IFF_UP)) {
2808                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2809                         err = -ENETDOWN;
2810                         goto out;
2811                 }
2812         }
2813
2814         err = -ENOBUFS;
2815         if (cfg->fc_nlinfo.nlh &&
2816             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2817                 table = fib6_get_table(net, cfg->fc_table);
2818                 if (!table) {
2819                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2820                         table = fib6_new_table(net, cfg->fc_table);
2821                 }
2822         } else {
2823                 table = fib6_new_table(net, cfg->fc_table);
2824         }
2825
2826         if (!table)
2827                 goto out;
2828
2829         rt = ip6_dst_alloc(net, NULL,
2830                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2831
2832         if (!rt) {
2833                 err = -ENOMEM;
2834                 goto out;
2835         }
2836
2837         if (cfg->fc_flags & RTF_EXPIRES)
2838                 rt6_set_expires(rt, jiffies +
2839                                 clock_t_to_jiffies(cfg->fc_expires));
2840         else
2841                 rt6_clean_expires(rt);
2842
2843         if (cfg->fc_protocol == RTPROT_UNSPEC)
2844                 cfg->fc_protocol = RTPROT_BOOT;
2845         rt->rt6i_protocol = cfg->fc_protocol;
2846
2847         addr_type = ipv6_addr_type(&cfg->fc_dst);
2848
2849         if (addr_type & IPV6_ADDR_MULTICAST)
2850                 rt->dst.input = ip6_mc_input;
2851         else if (cfg->fc_flags & RTF_LOCAL)
2852                 rt->dst.input = ip6_input;
2853         else
2854                 rt->dst.input = ip6_forward;
2855
2856         rt->dst.output = ip6_output;
2857
2858         if (cfg->fc_encap) {
2859                 struct lwtunnel_state *lwtstate;
2860
2861                 err = lwtunnel_build_state(cfg->fc_encap_type,
2862                                            cfg->fc_encap, AF_INET6, cfg,
2863                                            &lwtstate, extack);
2864                 if (err)
2865                         goto out;
2866                 rt->dst.lwtstate = lwtstate_get(lwtstate);
2867                 lwtunnel_set_redirect(&rt->dst);
2868         }
2869
2870         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2871         rt->rt6i_dst.plen = cfg->fc_dst_len;
2872         if (rt->rt6i_dst.plen == 128)
2873                 rt->dst.flags |= DST_HOST;
2874
2875 #ifdef CONFIG_IPV6_SUBTREES
2876         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2877         rt->rt6i_src.plen = cfg->fc_src_len;
2878 #endif
2879
2880         rt->rt6i_metric = cfg->fc_metric;
2881         rt->rt6i_nh_weight = 1;
2882
2883         /* We cannot add true routes via loopback here,
2884            they would result in kernel looping; promote them to reject routes
2885          */
2886         if ((cfg->fc_flags & RTF_REJECT) ||
2887             (dev && (dev->flags & IFF_LOOPBACK) &&
2888              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2889              !(cfg->fc_flags & RTF_LOCAL))) {
2890                 /* hold loopback dev/idev if we haven't done so. */
2891                 if (dev != net->loopback_dev) {
2892                         if (dev) {
2893                                 dev_put(dev);
2894                                 in6_dev_put(idev);
2895                         }
2896                         dev = net->loopback_dev;
2897                         dev_hold(dev);
2898                         idev = in6_dev_get(dev);
2899                         if (!idev) {
2900                                 err = -ENODEV;
2901                                 goto out;
2902                         }
2903                 }
2904                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2905                 switch (cfg->fc_type) {
2906                 case RTN_BLACKHOLE:
2907                         rt->dst.error = -EINVAL;
2908                         rt->dst.output = dst_discard_out;
2909                         rt->dst.input = dst_discard;
2910                         break;
2911                 case RTN_PROHIBIT:
2912                         rt->dst.error = -EACCES;
2913                         rt->dst.output = ip6_pkt_prohibit_out;
2914                         rt->dst.input = ip6_pkt_prohibit;
2915                         break;
2916                 case RTN_THROW:
2917                 case RTN_UNREACHABLE:
2918                 default:
2919                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2920                                         : (cfg->fc_type == RTN_UNREACHABLE)
2921                                         ? -EHOSTUNREACH : -ENETUNREACH;
2922                         rt->dst.output = ip6_pkt_discard_out;
2923                         rt->dst.input = ip6_pkt_discard;
2924                         break;
2925                 }
2926                 goto install_route;
2927         }
2928
2929         if (cfg->fc_flags & RTF_GATEWAY) {
2930                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2931                 if (err)
2932                         goto out;
2933
2934                 rt->rt6i_gateway = cfg->fc_gateway;
2935         }
2936
2937         err = -ENODEV;
2938         if (!dev)
2939                 goto out;
2940
2941         if (idev->cnf.disable_ipv6) {
2942                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2943                 err = -EACCES;
2944                 goto out;
2945         }
2946
2947         if (!(dev->flags & IFF_UP)) {
2948                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2949                 err = -ENETDOWN;
2950                 goto out;
2951         }
2952
2953         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2954                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2955                         NL_SET_ERR_MSG(extack, "Invalid source address");
2956                         err = -EINVAL;
2957                         goto out;
2958                 }
2959                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2960                 rt->rt6i_prefsrc.plen = 128;
2961         } else
2962                 rt->rt6i_prefsrc.plen = 0;
2963
2964         rt->rt6i_flags = cfg->fc_flags;
2965
2966 install_route:
2967         if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2968             !netif_carrier_ok(dev))
2969                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2970         rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2971         rt->dst.dev = dev;
2972         rt->rt6i_idev = idev;
2973         rt->rt6i_table = table;
2974
2975         cfg->fc_nlinfo.nl_net = dev_net(dev);
2976
2977         return rt;
2978 out:
2979         if (dev)
2980                 dev_put(dev);
2981         if (idev)
2982                 in6_dev_put(idev);
2983         if (rt)
2984                 dst_release_immediate(&rt->dst);
2985
2986         return ERR_PTR(err);
2987 }
2988
2989 int ip6_route_add(struct fib6_config *cfg,
2990                   struct netlink_ext_ack *extack)
2991 {
2992         struct mx6_config mxc = { .mx = NULL, };
2993         struct rt6_info *rt;
2994         int err;
2995
2996         rt = ip6_route_info_create(cfg, extack);
2997         if (IS_ERR(rt)) {
2998                 err = PTR_ERR(rt);
2999                 rt = NULL;
3000                 goto out;
3001         }
3002
3003         err = ip6_convert_metrics(&mxc, cfg);
3004         if (err)
3005                 goto out;
3006
3007         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
3008
3009         kfree(mxc.mx);
3010
3011         return err;
3012 out:
3013         if (rt)
3014                 dst_release_immediate(&rt->dst);
3015
3016         return err;
3017 }
3018
3019 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3020 {
3021         int err;
3022         struct fib6_table *table;
3023         struct net *net = dev_net(rt->dst.dev);
3024
3025         if (rt == net->ipv6.ip6_null_entry) {
3026                 err = -ENOENT;
3027                 goto out;
3028         }
3029
3030         table = rt->rt6i_table;
3031         spin_lock_bh(&table->tb6_lock);
3032         err = fib6_del(rt, info);
3033         spin_unlock_bh(&table->tb6_lock);
3034
3035 out:
3036         ip6_rt_put(rt);
3037         return err;
3038 }
3039
3040 int ip6_del_rt(struct rt6_info *rt)
3041 {
3042         struct nl_info info = {
3043                 .nl_net = dev_net(rt->dst.dev),
3044         };
3045         return __ip6_del_rt(rt, &info);
3046 }
3047
3048 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3049 {
3050         struct nl_info *info = &cfg->fc_nlinfo;
3051         struct net *net = info->nl_net;
3052         struct sk_buff *skb = NULL;
3053         struct fib6_table *table;
3054         int err = -ENOENT;
3055
3056         if (rt == net->ipv6.ip6_null_entry)
3057                 goto out_put;
3058         table = rt->rt6i_table;
3059         spin_lock_bh(&table->tb6_lock);
3060
3061         if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3062                 struct rt6_info *sibling, *next_sibling;
3063
3064                 /* prefer to send a single notification with all hops */
3065                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3066                 if (skb) {
3067                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3068
3069                         if (rt6_fill_node(net, skb, rt,
3070                                           NULL, NULL, 0, RTM_DELROUTE,
3071                                           info->portid, seq, 0) < 0) {
3072                                 kfree_skb(skb);
3073                                 skb = NULL;
3074                         } else
3075                                 info->skip_notify = 1;
3076                 }
3077
3078                 list_for_each_entry_safe(sibling, next_sibling,
3079                                          &rt->rt6i_siblings,
3080                                          rt6i_siblings) {
3081                         err = fib6_del(sibling, info);
3082                         if (err)
3083                                 goto out_unlock;
3084                 }
3085         }
3086
3087         err = fib6_del(rt, info);
3088 out_unlock:
3089         spin_unlock_bh(&table->tb6_lock);
3090 out_put:
3091         ip6_rt_put(rt);
3092
3093         if (skb) {
3094                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3095                             info->nlh, gfp_any());
3096         }
3097         return err;
3098 }
3099
3100 static int ip6_route_del(struct fib6_config *cfg,
3101                          struct netlink_ext_ack *extack)
3102 {
3103         struct rt6_info *rt, *rt_cache;
3104         struct fib6_table *table;
3105         struct fib6_node *fn;
3106         int err = -ESRCH;
3107
3108         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3109         if (!table) {
3110                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3111                 return err;
3112         }
3113
3114         rcu_read_lock();
3115
3116         fn = fib6_locate(&table->tb6_root,
3117                          &cfg->fc_dst, cfg->fc_dst_len,
3118                          &cfg->fc_src, cfg->fc_src_len,
3119                          !(cfg->fc_flags & RTF_CACHE));
3120
3121         if (fn) {
3122                 for_each_fib6_node_rt_rcu(fn) {
3123                         if (cfg->fc_flags & RTF_CACHE) {
3124                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3125                                                               &cfg->fc_src);
3126                                 if (!rt_cache)
3127                                         continue;
3128                                 rt = rt_cache;
3129                         }
3130                         if (cfg->fc_ifindex &&
3131                             (!rt->dst.dev ||
3132                              rt->dst.dev->ifindex != cfg->fc_ifindex))
3133                                 continue;
3134                         if (cfg->fc_flags & RTF_GATEWAY &&
3135                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3136                                 continue;
3137                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3138                                 continue;
3139                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3140                                 continue;
3141                         if (!dst_hold_safe(&rt->dst))
3142                                 break;
3143                         rcu_read_unlock();
3144
3145                         /* if gateway was specified only delete the one hop */
3146                         if (cfg->fc_flags & RTF_GATEWAY)
3147                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3148
3149                         return __ip6_del_rt_siblings(rt, cfg);
3150                 }
3151         }
3152         rcu_read_unlock();
3153
3154         return err;
3155 }
3156
3157 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3158 {
3159         struct netevent_redirect netevent;
3160         struct rt6_info *rt, *nrt = NULL;
3161         struct ndisc_options ndopts;
3162         struct inet6_dev *in6_dev;
3163         struct neighbour *neigh;
3164         struct rd_msg *msg;
3165         int optlen, on_link;
3166         u8 *lladdr;
3167
3168         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3169         optlen -= sizeof(*msg);
3170
3171         if (optlen < 0) {
3172                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3173                 return;
3174         }
3175
3176         msg = (struct rd_msg *)icmp6_hdr(skb);
3177
3178         if (ipv6_addr_is_multicast(&msg->dest)) {
3179                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3180                 return;
3181         }
3182
3183         on_link = 0;
3184         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3185                 on_link = 1;
3186         } else if (ipv6_addr_type(&msg->target) !=
3187                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3188                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3189                 return;
3190         }
3191
3192         in6_dev = __in6_dev_get(skb->dev);
3193         if (!in6_dev)
3194                 return;
3195         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3196                 return;
3197
3198         /* RFC2461 8.1:
3199          *      The IP source address of the Redirect MUST be the same as the current
3200          *      first-hop router for the specified ICMP Destination Address.
3201          */
3202
3203         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3204                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3205                 return;
3206         }
3207
3208         lladdr = NULL;
3209         if (ndopts.nd_opts_tgt_lladdr) {
3210                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3211                                              skb->dev);
3212                 if (!lladdr) {
3213                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3214                         return;
3215                 }
3216         }
3217
3218         rt = (struct rt6_info *) dst;
3219         if (rt->rt6i_flags & RTF_REJECT) {
3220                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3221                 return;
3222         }
3223
3224         /* Redirect received -> path was valid.
3225          * Look, redirects are sent only in response to data packets,
3226          * so that this nexthop apparently is reachable. --ANK
3227          */
3228         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3229
3230         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3231         if (!neigh)
3232                 return;
3233
3234         /*
3235          *      We have finally decided to accept it.
3236          */
3237
3238         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3239                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3240                      NEIGH_UPDATE_F_OVERRIDE|
3241                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3242                                      NEIGH_UPDATE_F_ISROUTER)),
3243                      NDISC_REDIRECT, &ndopts);
3244
3245         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3246         if (!nrt)
3247                 goto out;
3248
3249         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3250         if (on_link)
3251                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3252
3253         nrt->rt6i_protocol = RTPROT_REDIRECT;
3254         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3255
3256         /* No need to remove rt from the exception table if rt is
3257          * a cached route because rt6_insert_exception() will
3258          * takes care of it
3259          */
3260         if (rt6_insert_exception(nrt, rt)) {
3261                 dst_release_immediate(&nrt->dst);
3262                 goto out;
3263         }
3264
3265         netevent.old = &rt->dst;
3266         netevent.new = &nrt->dst;
3267         netevent.daddr = &msg->dest;
3268         netevent.neigh = neigh;
3269         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3270
3271 out:
3272         neigh_release(neigh);
3273 }
3274
3275 /*
3276  *      Misc support functions
3277  */
3278
3279 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3280 {
3281         BUG_ON(from->from);
3282
3283         rt->rt6i_flags &= ~RTF_EXPIRES;
3284         dst_hold(&from->dst);
3285         rt->from = from;
3286         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3287 }
3288
3289 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3290 {
3291         rt->dst.input = ort->dst.input;
3292         rt->dst.output = ort->dst.output;
3293         rt->rt6i_dst = ort->rt6i_dst;
3294         rt->dst.error = ort->dst.error;
3295         rt->rt6i_idev = ort->rt6i_idev;
3296         if (rt->rt6i_idev)
3297                 in6_dev_hold(rt->rt6i_idev);
3298         rt->dst.lastuse = jiffies;
3299         rt->rt6i_gateway = ort->rt6i_gateway;
3300         rt->rt6i_flags = ort->rt6i_flags;
3301         rt6_set_from(rt, ort);
3302         rt->rt6i_metric = ort->rt6i_metric;
3303 #ifdef CONFIG_IPV6_SUBTREES
3304         rt->rt6i_src = ort->rt6i_src;
3305 #endif
3306         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3307         rt->rt6i_table = ort->rt6i_table;
3308         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3309 }
3310
3311 #ifdef CONFIG_IPV6_ROUTE_INFO
3312 static struct rt6_info *rt6_get_route_info(struct net *net,
3313                                            const struct in6_addr *prefix, int prefixlen,
3314                                            const struct in6_addr *gwaddr,
3315                                            struct net_device *dev)
3316 {
3317         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3318         int ifindex = dev->ifindex;
3319         struct fib6_node *fn;
3320         struct rt6_info *rt = NULL;
3321         struct fib6_table *table;
3322
3323         table = fib6_get_table(net, tb_id);
3324         if (!table)
3325                 return NULL;
3326
3327         rcu_read_lock();
3328         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3329         if (!fn)
3330                 goto out;
3331
3332         for_each_fib6_node_rt_rcu(fn) {
3333                 if (rt->dst.dev->ifindex != ifindex)
3334                         continue;
3335                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3336                         continue;
3337                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3338                         continue;
3339                 ip6_hold_safe(NULL, &rt, false);
3340                 break;
3341         }
3342 out:
3343         rcu_read_unlock();
3344         return rt;
3345 }
3346
3347 static struct rt6_info *rt6_add_route_info(struct net *net,
3348                                            const struct in6_addr *prefix, int prefixlen,
3349                                            const struct in6_addr *gwaddr,
3350                                            struct net_device *dev,
3351                                            unsigned int pref)
3352 {
3353         struct fib6_config cfg = {
3354                 .fc_metric      = IP6_RT_PRIO_USER,
3355                 .fc_ifindex     = dev->ifindex,
3356                 .fc_dst_len     = prefixlen,
3357                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3358                                   RTF_UP | RTF_PREF(pref),
3359                 .fc_protocol = RTPROT_RA,
3360                 .fc_nlinfo.portid = 0,
3361                 .fc_nlinfo.nlh = NULL,
3362                 .fc_nlinfo.nl_net = net,
3363         };
3364
3365         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3366         cfg.fc_dst = *prefix;
3367         cfg.fc_gateway = *gwaddr;
3368
3369         /* We should treat it as a default route if prefix length is 0. */
3370         if (!prefixlen)
3371                 cfg.fc_flags |= RTF_DEFAULT;
3372
3373         ip6_route_add(&cfg, NULL);
3374
3375         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3376 }
3377 #endif
3378
3379 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3380 {
3381         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3382         struct rt6_info *rt;
3383         struct fib6_table *table;
3384
3385         table = fib6_get_table(dev_net(dev), tb_id);
3386         if (!table)
3387                 return NULL;
3388
3389         rcu_read_lock();
3390         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3391                 if (dev == rt->dst.dev &&
3392                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3393                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
3394                         break;
3395         }
3396         if (rt)
3397                 ip6_hold_safe(NULL, &rt, false);
3398         rcu_read_unlock();
3399         return rt;
3400 }
3401
3402 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3403                                      struct net_device *dev,
3404                                      unsigned int pref)
3405 {
3406         struct fib6_config cfg = {
3407                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3408                 .fc_metric      = IP6_RT_PRIO_USER,
3409                 .fc_ifindex     = dev->ifindex,
3410                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3411                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3412                 .fc_protocol = RTPROT_RA,
3413                 .fc_nlinfo.portid = 0,
3414                 .fc_nlinfo.nlh = NULL,
3415                 .fc_nlinfo.nl_net = dev_net(dev),
3416         };
3417
3418         cfg.fc_gateway = *gwaddr;
3419
3420         if (!ip6_route_add(&cfg, NULL)) {
3421                 struct fib6_table *table;
3422
3423                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3424                 if (table)
3425                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3426         }
3427
3428         return rt6_get_dflt_router(gwaddr, dev);
3429 }
3430
3431 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3432 {
3433         struct rt6_info *rt;
3434
3435 restart:
3436         rcu_read_lock();
3437         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3438                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3439                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3440                         if (dst_hold_safe(&rt->dst)) {
3441                                 rcu_read_unlock();
3442                                 ip6_del_rt(rt);
3443                         } else {
3444                                 rcu_read_unlock();
3445                         }
3446                         goto restart;
3447                 }
3448         }
3449         rcu_read_unlock();
3450
3451         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3452 }
3453
3454 void rt6_purge_dflt_routers(struct net *net)
3455 {
3456         struct fib6_table *table;
3457         struct hlist_head *head;
3458         unsigned int h;
3459
3460         rcu_read_lock();
3461
3462         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3463                 head = &net->ipv6.fib_table_hash[h];
3464                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3465                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3466                                 __rt6_purge_dflt_routers(table);
3467                 }
3468         }
3469
3470         rcu_read_unlock();
3471 }
3472
3473 static void rtmsg_to_fib6_config(struct net *net,
3474                                  struct in6_rtmsg *rtmsg,
3475                                  struct fib6_config *cfg)
3476 {
3477         memset(cfg, 0, sizeof(*cfg));
3478
3479         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3480                          : RT6_TABLE_MAIN;
3481         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3482         cfg->fc_metric = rtmsg->rtmsg_metric;
3483         cfg->fc_expires = rtmsg->rtmsg_info;
3484         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3485         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3486         cfg->fc_flags = rtmsg->rtmsg_flags;
3487
3488         cfg->fc_nlinfo.nl_net = net;
3489
3490         cfg->fc_dst = rtmsg->rtmsg_dst;
3491         cfg->fc_src = rtmsg->rtmsg_src;
3492         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3493 }
3494
3495 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3496 {
3497         struct fib6_config cfg;
3498         struct in6_rtmsg rtmsg;
3499         int err;
3500
3501         switch (cmd) {
3502         case SIOCADDRT:         /* Add a route */
3503         case SIOCDELRT:         /* Delete a route */
3504                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3505                         return -EPERM;
3506                 err = copy_from_user(&rtmsg, arg,
3507                                      sizeof(struct in6_rtmsg));
3508                 if (err)
3509                         return -EFAULT;
3510
3511                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3512
3513                 rtnl_lock();
3514                 switch (cmd) {
3515                 case SIOCADDRT:
3516                         err = ip6_route_add(&cfg, NULL);
3517                         break;
3518                 case SIOCDELRT:
3519                         err = ip6_route_del(&cfg, NULL);
3520                         break;
3521                 default:
3522                         err = -EINVAL;
3523                 }
3524                 rtnl_unlock();
3525
3526                 return err;
3527         }
3528
3529         return -EINVAL;
3530 }
3531
3532 /*
3533  *      Drop the packet on the floor
3534  */
3535
3536 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3537 {
3538         int type;
3539         struct dst_entry *dst = skb_dst(skb);
3540         switch (ipstats_mib_noroutes) {
3541         case IPSTATS_MIB_INNOROUTES:
3542                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3543                 if (type == IPV6_ADDR_ANY) {
3544                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3545                                       IPSTATS_MIB_INADDRERRORS);
3546                         break;
3547                 }
3548                 /* FALLTHROUGH */
3549         case IPSTATS_MIB_OUTNOROUTES:
3550                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3551                               ipstats_mib_noroutes);
3552                 break;
3553         }
3554         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3555         kfree_skb(skb);
3556         return 0;
3557 }
3558
3559 static int ip6_pkt_discard(struct sk_buff *skb)
3560 {
3561         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3562 }
3563
3564 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3565 {
3566         skb->dev = skb_dst(skb)->dev;
3567         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3568 }
3569
3570 static int ip6_pkt_prohibit(struct sk_buff *skb)
3571 {
3572         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3573 }
3574
3575 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3576 {
3577         skb->dev = skb_dst(skb)->dev;
3578         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3579 }
3580
3581 /*
3582  *      Allocate a dst for local (unicast / anycast) address.
3583  */
3584
3585 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3586                                     const struct in6_addr *addr,
3587                                     bool anycast)
3588 {
3589         u32 tb_id;
3590         struct net *net = dev_net(idev->dev);
3591         struct net_device *dev = idev->dev;
3592         struct rt6_info *rt;
3593
3594         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3595         if (!rt)
3596                 return ERR_PTR(-ENOMEM);
3597
3598         in6_dev_hold(idev);
3599
3600         rt->dst.flags |= DST_HOST;
3601         rt->dst.input = ip6_input;
3602         rt->dst.output = ip6_output;
3603         rt->rt6i_idev = idev;
3604
3605         rt->rt6i_protocol = RTPROT_KERNEL;
3606         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3607         if (anycast)
3608                 rt->rt6i_flags |= RTF_ANYCAST;
3609         else
3610                 rt->rt6i_flags |= RTF_LOCAL;
3611
3612         rt->rt6i_gateway  = *addr;
3613         rt->rt6i_dst.addr = *addr;
3614         rt->rt6i_dst.plen = 128;
3615         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3616         rt->rt6i_table = fib6_get_table(net, tb_id);
3617
3618         return rt;
3619 }
3620
/*
 * Remove a deleted IP from prefsrc entries: argument bundle handed to
 * fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL matches any */
	struct net *net;		/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;		/* the address being removed */
};
3627
3628 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3629 {
3630         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3631         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3632         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3633
3634         if (((void *)rt->dst.dev == dev || !dev) &&
3635             rt != net->ipv6.ip6_null_entry &&
3636             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3637                 spin_lock_bh(&rt6_exception_lock);
3638                 /* remove prefsrc entry */
3639                 rt->rt6i_prefsrc.plen = 0;
3640                 /* need to update cache as well */
3641                 rt6_exceptions_remove_prefsrc(rt);
3642                 spin_unlock_bh(&rt6_exception_lock);
3643         }
3644         return 0;
3645 }
3646
3647 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3648 {
3649         struct net *net = dev_net(ifp->idev->dev);
3650         struct arg_dev_net_ip adni = {
3651                 .dev = ifp->idev->dev,
3652                 .net = net,
3653                 .addr = &ifp->addr,
3654         };
3655         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3656 }
3657
/* Flag set tested as a whole by fib6_clean_tohost(): all three bits must be present. */
#define RTF_RA_ROUTER		(RTF_GATEWAY | RTF_DEFAULT | RTF_ADDRCONF)
3659
3660 /* Remove routers and update dst entries when gateway turn into host. */
3661 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3662 {
3663         struct in6_addr *gateway = (struct in6_addr *)arg;
3664
3665         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3666             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3667                 return -1;
3668         }
3669
3670         /* Further clean up cached routes in exception table.
3671          * This is needed because cached route may have a different
3672          * gateway than its 'parent' in the case of an ip redirect.
3673          */
3674         rt6_exceptions_clean_tohost(rt, gateway);
3675
3676         return 0;
3677 }
3678
3679 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3680 {
3681         fib6_clean_all(net, fib6_clean_tohost, gateway);
3682 }
3683
/*
 * Argument passed to the fib6_ifup() / fib6_ifdown() walkers.
 * The union member in use depends on the walker: fib6_ifup() reads
 * nh_flags, fib6_ifdown() reads event.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* nexthop flags to clear on up */
		unsigned long event;	/* NETDEV_* event on down */
	};
};
3691
3692 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3693 {
3694         struct rt6_info *iter;
3695         struct fib6_node *fn;
3696
3697         fn = rcu_dereference_protected(rt->rt6i_node,
3698                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3699         iter = rcu_dereference_protected(fn->leaf,
3700                         lockdep_is_held(&rt->rt6i_table->tb6_lock));
3701         while (iter) {
3702                 if (iter->rt6i_metric == rt->rt6i_metric &&
3703                     rt6_qualify_for_ecmp(iter))
3704                         return iter;
3705                 iter = rcu_dereference_protected(iter->rt6_next,
3706                                 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3707         }
3708
3709         return NULL;
3710 }
3711
3712 static bool rt6_is_dead(const struct rt6_info *rt)
3713 {
3714         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3715             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3716              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3717                 return true;
3718
3719         return false;
3720 }
3721
3722 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3723 {
3724         struct rt6_info *iter;
3725         int total = 0;
3726
3727         if (!rt6_is_dead(rt))
3728                 total += rt->rt6i_nh_weight;
3729
3730         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3731                 if (!rt6_is_dead(iter))
3732                         total += iter->rt6i_nh_weight;
3733         }
3734
3735         return total;
3736 }
3737
3738 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3739 {
3740         int upper_bound = -1;
3741
3742         if (!rt6_is_dead(rt)) {
3743                 *weight += rt->rt6i_nh_weight;
3744                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3745                                                     total) - 1;
3746         }
3747         atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3748 }
3749
3750 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3751 {
3752         struct rt6_info *iter;
3753         int weight = 0;
3754
3755         rt6_upper_bound_set(rt, &weight, total);
3756
3757         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3758                 rt6_upper_bound_set(iter, &weight, total);
3759 }
3760
3761 void rt6_multipath_rebalance(struct rt6_info *rt)
3762 {
3763         struct rt6_info *first;
3764         int total;
3765
3766         /* In case the entire multipath route was marked for flushing,
3767          * then there is no need to rebalance upon the removal of every
3768          * sibling route.
3769          */
3770         if (!rt->rt6i_nsiblings || rt->should_flush)
3771                 return;
3772
3773         /* During lookup routes are evaluated in order, so we need to
3774          * make sure upper bounds are assigned from the first sibling
3775          * onwards.
3776          */
3777         first = rt6_multipath_first_sibling(rt);
3778         if (WARN_ON_ONCE(!first))
3779                 return;
3780
3781         total = rt6_multipath_total_weight(first);
3782         rt6_multipath_upper_bound_set(first, total);
3783 }
3784
3785 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3786 {
3787         const struct arg_netdev_event *arg = p_arg;
3788         const struct net *net = dev_net(arg->dev);
3789
3790         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3791                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3792                 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3793                 rt6_multipath_rebalance(rt);
3794         }
3795
3796         return 0;
3797 }
3798
3799 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3800 {
3801         struct arg_netdev_event arg = {
3802                 .dev = dev,
3803                 {
3804                         .nh_flags = nh_flags,
3805                 },
3806         };
3807
3808         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3809                 arg.nh_flags |= RTNH_F_LINKDOWN;
3810
3811         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3812 }
3813
3814 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3815                                    const struct net_device *dev)
3816 {
3817         struct rt6_info *iter;
3818
3819         if (rt->dst.dev == dev)
3820                 return true;
3821         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3822                 if (iter->dst.dev == dev)
3823                         return true;
3824
3825         return false;
3826 }
3827
3828 static void rt6_multipath_flush(struct rt6_info *rt)
3829 {
3830         struct rt6_info *iter;
3831
3832         rt->should_flush = 1;
3833         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3834                 iter->should_flush = 1;
3835 }
3836
3837 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3838                                              const struct net_device *down_dev)
3839 {
3840         struct rt6_info *iter;
3841         unsigned int dead = 0;
3842
3843         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3844                 dead++;
3845         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3846                 if (iter->dst.dev == down_dev ||
3847                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3848                         dead++;
3849
3850         return dead;
3851 }
3852
3853 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3854                                        const struct net_device *dev,
3855                                        unsigned int nh_flags)
3856 {
3857         struct rt6_info *iter;
3858
3859         if (rt->dst.dev == dev)
3860                 rt->rt6i_nh_flags |= nh_flags;
3861         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3862                 if (iter->dst.dev == dev)
3863                         iter->rt6i_nh_flags |= nh_flags;
3864 }
3865
3866 /* called with write lock held for table with rt */
3867 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3868 {
3869         const struct arg_netdev_event *arg = p_arg;
3870         const struct net_device *dev = arg->dev;
3871         const struct net *net = dev_net(dev);
3872
3873         if (rt == net->ipv6.ip6_null_entry)
3874                 return 0;
3875
3876         switch (arg->event) {
3877         case NETDEV_UNREGISTER:
3878                 return rt->dst.dev == dev ? -1 : 0;
3879         case NETDEV_DOWN:
3880                 if (rt->should_flush)
3881                         return -1;
3882                 if (!rt->rt6i_nsiblings)
3883                         return rt->dst.dev == dev ? -1 : 0;
3884                 if (rt6_multipath_uses_dev(rt, dev)) {
3885                         unsigned int count;
3886
3887                         count = rt6_multipath_dead_count(rt, dev);
3888                         if (rt->rt6i_nsiblings + 1 == count) {
3889                                 rt6_multipath_flush(rt);
3890                                 return -1;
3891                         }
3892                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3893                                                    RTNH_F_LINKDOWN);
3894                         fib6_update_sernum(rt);
3895                         rt6_multipath_rebalance(rt);
3896                 }
3897                 return -2;
3898         case NETDEV_CHANGE:
3899                 if (rt->dst.dev != dev ||
3900                     rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3901                         break;
3902                 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3903                 rt6_multipath_rebalance(rt);
3904                 break;
3905         }
3906
3907         return 0;
3908 }
3909
3910 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3911 {
3912         struct arg_netdev_event arg = {
3913                 .dev = dev,
3914                 {
3915                         .event = event,
3916                 },
3917         };
3918
3919         fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3920 }
3921
3922 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3923 {
3924         rt6_sync_down_dev(dev, event);
3925         rt6_uncached_list_flush_dev(dev_net(dev), dev);
3926         neigh_ifdown(&nd_tbl, dev);
3927 }
3928
/* Argument bundle for the rt6_mtu_change_route() walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
3933
3934 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3935 {
3936         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3937         struct inet6_dev *idev;
3938
3939         /* In IPv6 pmtu discovery is not optional,
3940            so that RTAX_MTU lock cannot disable it.
3941            We still use this lock to block changes
3942            caused by addrconf/ndisc.
3943         */
3944
3945         idev = __in6_dev_get(arg->dev);
3946         if (!idev)
3947                 return 0;
3948
3949         /* For administrative MTU increase, there is no way to discover
3950            IPv6 PMTU increase, so PMTU increase should be updated here.
3951            Since RFC 1981 doesn't include administrative MTU increase
3952            update PMTU increase is a MUST. (i.e. jumbo frame)
3953          */
3954         if (rt->dst.dev == arg->dev &&
3955             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3956                 spin_lock_bh(&rt6_exception_lock);
3957                 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3958                     rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3959                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3960                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3961                 spin_unlock_bh(&rt6_exception_lock);
3962         }
3963         return 0;
3964 }
3965
3966 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3967 {
3968         struct rt6_mtu_change_arg arg = {
3969                 .dev = dev,
3970                 .mtu = mtu,
3971         };
3972
3973         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3974 }
3975
3976 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3977         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3978         [RTA_OIF]               = { .type = NLA_U32 },
3979         [RTA_IIF]               = { .type = NLA_U32 },
3980         [RTA_PRIORITY]          = { .type = NLA_U32 },
3981         [RTA_METRICS]           = { .type = NLA_NESTED },
3982         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
3983         [RTA_PREF]              = { .type = NLA_U8 },
3984         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
3985         [RTA_ENCAP]             = { .type = NLA_NESTED },
3986         [RTA_EXPIRES]           = { .type = NLA_U32 },
3987         [RTA_UID]               = { .type = NLA_U32 },
3988         [RTA_MARK]              = { .type = NLA_U32 },
3989 };
3990
3991 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3992                               struct fib6_config *cfg,
3993                               struct netlink_ext_ack *extack)
3994 {
3995         struct rtmsg *rtm;
3996         struct nlattr *tb[RTA_MAX+1];
3997         unsigned int pref;
3998         int err;
3999
4000         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4001                           NULL);
4002         if (err < 0)
4003                 goto errout;
4004
4005         err = -EINVAL;
4006         rtm = nlmsg_data(nlh);
4007         memset(cfg, 0, sizeof(*cfg));
4008
4009         cfg->fc_table = rtm->rtm_table;
4010         cfg->fc_dst_len = rtm->rtm_dst_len;
4011         cfg->fc_src_len = rtm->rtm_src_len;
4012         cfg->fc_flags = RTF_UP;
4013         cfg->fc_protocol = rtm->rtm_protocol;
4014         cfg->fc_type = rtm->rtm_type;
4015
4016         if (rtm->rtm_type == RTN_UNREACHABLE ||
4017             rtm->rtm_type == RTN_BLACKHOLE ||
4018             rtm->rtm_type == RTN_PROHIBIT ||
4019             rtm->rtm_type == RTN_THROW)
4020                 cfg->fc_flags |= RTF_REJECT;
4021
4022         if (rtm->rtm_type == RTN_LOCAL)
4023                 cfg->fc_flags |= RTF_LOCAL;
4024
4025         if (rtm->rtm_flags & RTM_F_CLONED)
4026                 cfg->fc_flags |= RTF_CACHE;
4027
4028         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4029
4030         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4031         cfg->fc_nlinfo.nlh = nlh;
4032         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4033
4034         if (tb[RTA_GATEWAY]) {
4035                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4036                 cfg->fc_flags |= RTF_GATEWAY;
4037         }
4038
4039         if (tb[RTA_DST]) {
4040                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4041
4042                 if (nla_len(tb[RTA_DST]) < plen)
4043                         goto errout;
4044
4045                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4046         }
4047
4048         if (tb[RTA_SRC]) {
4049                 int plen = (rtm->rtm_src_len + 7) >> 3;
4050
4051                 if (nla_len(tb[RTA_SRC]) < plen)
4052                         goto errout;
4053
4054                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4055         }
4056
4057         if (tb[RTA_PREFSRC])
4058                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4059
4060         if (tb[RTA_OIF])
4061                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4062
4063         if (tb[RTA_PRIORITY])
4064                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4065
4066         if (tb[RTA_METRICS]) {
4067                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4068                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4069         }
4070
4071         if (tb[RTA_TABLE])
4072                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4073
4074         if (tb[RTA_MULTIPATH]) {
4075                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4076                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4077
4078                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4079                                                      cfg->fc_mp_len, extack);
4080                 if (err < 0)
4081                         goto errout;
4082         }
4083
4084         if (tb[RTA_PREF]) {
4085                 pref = nla_get_u8(tb[RTA_PREF]);
4086                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4087                     pref != ICMPV6_ROUTER_PREF_HIGH)
4088                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4089                 cfg->fc_flags |= RTF_PREF(pref);
4090         }
4091
4092         if (tb[RTA_ENCAP])
4093                 cfg->fc_encap = tb[RTA_ENCAP];
4094
4095         if (tb[RTA_ENCAP_TYPE]) {
4096                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4097
4098                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4099                 if (err < 0)
4100                         goto errout;
4101         }
4102
4103         if (tb[RTA_EXPIRES]) {
4104                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4105
4106                 if (addrconf_finite_timeout(timeout)) {
4107                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4108                         cfg->fc_flags |= RTF_EXPIRES;
4109                 }
4110         }
4111
4112         err = 0;
4113 errout:
4114         return err;
4115 }
4116
4117 struct rt6_nh {
4118         struct rt6_info *rt6_info;
4119         struct fib6_config r_cfg;
4120         struct mx6_config mxc;
4121         struct list_head next;
4122 };
4123
4124 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4125 {
4126         struct rt6_nh *nh;
4127
4128         list_for_each_entry(nh, rt6_nh_list, next) {
4129                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4130                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4131                         nh->r_cfg.fc_ifindex);
4132         }
4133 }
4134
4135 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4136                                  struct rt6_info *rt, struct fib6_config *r_cfg)
4137 {
4138         struct rt6_nh *nh;
4139         int err = -EEXIST;
4140
4141         list_for_each_entry(nh, rt6_nh_list, next) {
4142                 /* check if rt6_info already exists */
4143                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4144                         return err;
4145         }
4146
4147         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4148         if (!nh)
4149                 return -ENOMEM;
4150         nh->rt6_info = rt;
4151         err = ip6_convert_metrics(&nh->mxc, r_cfg);
4152         if (err) {
4153                 kfree(nh);
4154                 return err;
4155         }
4156         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4157         list_add_tail(&nh->next, rt6_nh_list);
4158
4159         return 0;
4160 }
4161
4162 static void ip6_route_mpath_notify(struct rt6_info *rt,
4163                                    struct rt6_info *rt_last,
4164                                    struct nl_info *info,
4165                                    __u16 nlflags)
4166 {
4167         /* if this is an APPEND route, then rt points to the first route
4168          * inserted and rt_last points to last route inserted. Userspace
4169          * wants a consistent dump of the route which starts at the first
4170          * nexthop. Since sibling routes are always added at the end of
4171          * the list, find the first sibling of the last route appended
4172          */
4173         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4174                 rt = list_first_entry(&rt_last->rt6i_siblings,
4175                                       struct rt6_info,
4176                                       rt6i_siblings);
4177         }
4178
4179         if (rt)
4180                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4181 }
4182
4183 static int ip6_route_multipath_add(struct fib6_config *cfg,
4184                                    struct netlink_ext_ack *extack)
4185 {
4186         struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4187         struct nl_info *info = &cfg->fc_nlinfo;
4188         struct fib6_config r_cfg;
4189         struct rtnexthop *rtnh;
4190         struct rt6_info *rt;
4191         struct rt6_nh *err_nh;
4192         struct rt6_nh *nh, *nh_safe;
4193         __u16 nlflags;
4194         int remaining;
4195         int attrlen;
4196         int err = 1;
4197         int nhn = 0;
4198         int replace = (cfg->fc_nlinfo.nlh &&
4199                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4200         LIST_HEAD(rt6_nh_list);
4201
4202         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4203         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4204                 nlflags |= NLM_F_APPEND;
4205
4206         remaining = cfg->fc_mp_len;
4207         rtnh = (struct rtnexthop *)cfg->fc_mp;
4208
4209         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4210          * rt6_info structs per nexthop
4211          */
4212         while (rtnh_ok(rtnh, remaining)) {
4213                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4214                 if (rtnh->rtnh_ifindex)
4215                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4216
4217                 attrlen = rtnh_attrlen(rtnh);
4218                 if (attrlen > 0) {
4219                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4220
4221                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4222                         if (nla) {
4223                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4224                                 r_cfg.fc_flags |= RTF_GATEWAY;
4225                         }
4226                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4227                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4228                         if (nla)
4229                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4230                 }
4231
4232                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4233                 rt = ip6_route_info_create(&r_cfg, extack);
4234                 if (IS_ERR(rt)) {
4235                         err = PTR_ERR(rt);
4236                         rt = NULL;
4237                         goto cleanup;
4238                 }
4239
4240                 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4241
4242                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4243                 if (err) {
4244                         dst_release_immediate(&rt->dst);
4245                         goto cleanup;
4246                 }
4247
4248                 rtnh = rtnh_next(rtnh, &remaining);
4249         }
4250
4251         /* for add and replace send one notification with all nexthops.
4252          * Skip the notification in fib6_add_rt2node and send one with
4253          * the full route when done
4254          */
4255         info->skip_notify = 1;
4256
4257         err_nh = NULL;
4258         list_for_each_entry(nh, &rt6_nh_list, next) {
4259                 rt_last = nh->rt6_info;
4260                 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4261                 /* save reference to first route for notification */
4262                 if (!rt_notif && !err)
4263                         rt_notif = nh->rt6_info;
4264
4265                 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4266                 nh->rt6_info = NULL;
4267                 if (err) {
4268                         if (replace && nhn)
4269                                 ip6_print_replace_route_err(&rt6_nh_list);
4270                         err_nh = nh;
4271                         goto add_errout;
4272                 }
4273
4274                 /* Because each route is added like a single route we remove
4275                  * these flags after the first nexthop: if there is a collision,
4276                  * we have already failed to add the first nexthop:
4277                  * fib6_add_rt2node() has rejected it; when replacing, old
4278                  * nexthops have been replaced by first new, the rest should
4279                  * be added to it.
4280                  */
4281                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4282                                                      NLM_F_REPLACE);
4283                 nhn++;
4284         }
4285
4286         /* success ... tell user about new route */
4287         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4288         goto cleanup;
4289
4290 add_errout:
4291         /* send notification for routes that were added so that
4292          * the delete notifications sent by ip6_route_del are
4293          * coherent
4294          */
4295         if (rt_notif)
4296                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4297
4298         /* Delete routes that were already added */
4299         list_for_each_entry(nh, &rt6_nh_list, next) {
4300                 if (err_nh == nh)
4301                         break;
4302                 ip6_route_del(&nh->r_cfg, extack);
4303         }
4304
4305 cleanup:
4306         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4307                 if (nh->rt6_info)
4308                         dst_release_immediate(&nh->rt6_info->dst);
4309                 kfree(nh->mxc.mx);
4310                 list_del(&nh->next);
4311                 kfree(nh);
4312         }
4313
4314         return err;
4315 }
4316
4317 static int ip6_route_multipath_del(struct fib6_config *cfg,
4318                                    struct netlink_ext_ack *extack)
4319 {
4320         struct fib6_config r_cfg;
4321         struct rtnexthop *rtnh;
4322         int remaining;
4323         int attrlen;
4324         int err = 1, last_err = 0;
4325
4326         remaining = cfg->fc_mp_len;
4327         rtnh = (struct rtnexthop *)cfg->fc_mp;
4328
4329         /* Parse a Multipath Entry */
4330         while (rtnh_ok(rtnh, remaining)) {
4331                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4332                 if (rtnh->rtnh_ifindex)
4333                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4334
4335                 attrlen = rtnh_attrlen(rtnh);
4336                 if (attrlen > 0) {
4337                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4338
4339                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4340                         if (nla) {
4341                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4342                                 r_cfg.fc_flags |= RTF_GATEWAY;
4343                         }
4344                 }
4345                 err = ip6_route_del(&r_cfg, extack);
4346                 if (err)
4347                         last_err = err;
4348
4349                 rtnh = rtnh_next(rtnh, &remaining);
4350         }
4351
4352         return last_err;
4353 }
4354
4355 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4356                               struct netlink_ext_ack *extack)
4357 {
4358         struct fib6_config cfg;
4359         int err;
4360
4361         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4362         if (err < 0)
4363                 return err;
4364
4365         if (cfg.fc_mp)
4366                 return ip6_route_multipath_del(&cfg, extack);
4367         else {
4368                 cfg.fc_delete_all_nh = 1;
4369                 return ip6_route_del(&cfg, extack);
4370         }
4371 }
4372
4373 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4374                               struct netlink_ext_ack *extack)
4375 {
4376         struct fib6_config cfg;
4377         int err;
4378
4379         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4380         if (err < 0)
4381                 return err;
4382
4383         if (cfg.fc_mp)
4384                 return ip6_route_multipath_add(&cfg, extack);
4385         else
4386                 return ip6_route_add(&cfg, extack);
4387 }
4388
4389 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4390 {
4391         int nexthop_len = 0;
4392
4393         if (rt->rt6i_nsiblings) {
4394                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4395                             + NLA_ALIGN(sizeof(struct rtnexthop))
4396                             + nla_total_size(16) /* RTA_GATEWAY */
4397                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4398
4399                 nexthop_len *= rt->rt6i_nsiblings;
4400         }
4401
4402         return NLMSG_ALIGN(sizeof(struct rtmsg))
4403                + nla_total_size(16) /* RTA_SRC */
4404                + nla_total_size(16) /* RTA_DST */
4405                + nla_total_size(16) /* RTA_GATEWAY */
4406                + nla_total_size(16) /* RTA_PREFSRC */
4407                + nla_total_size(4) /* RTA_TABLE */
4408                + nla_total_size(4) /* RTA_IIF */
4409                + nla_total_size(4) /* RTA_OIF */
4410                + nla_total_size(4) /* RTA_PRIORITY */
4411                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4412                + nla_total_size(sizeof(struct rta_cacheinfo))
4413                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4414                + nla_total_size(1) /* RTA_PREF */
4415                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4416                + nexthop_len;
4417 }
4418
4419 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4420                             unsigned int *flags, bool skip_oif)
4421 {
4422         if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4423                 *flags |= RTNH_F_DEAD;
4424
4425         if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4426                 *flags |= RTNH_F_LINKDOWN;
4427                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4428                         *flags |= RTNH_F_DEAD;
4429         }
4430
4431         if (rt->rt6i_flags & RTF_GATEWAY) {
4432                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4433                         goto nla_put_failure;
4434         }
4435
4436         *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4437         if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4438                 *flags |= RTNH_F_OFFLOAD;
4439
4440         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4441         if (!skip_oif && rt->dst.dev &&
4442             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4443                 goto nla_put_failure;
4444
4445         if (rt->dst.lwtstate &&
4446             lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4447                 goto nla_put_failure;
4448
4449         return 0;
4450
4451 nla_put_failure:
4452         return -EMSGSIZE;
4453 }
4454
4455 /* add multipath next hop */
4456 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4457 {
4458         struct rtnexthop *rtnh;
4459         unsigned int flags = 0;
4460
4461         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4462         if (!rtnh)
4463                 goto nla_put_failure;
4464
4465         rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4466         rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4467
4468         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4469                 goto nla_put_failure;
4470
4471         rtnh->rtnh_flags = flags;
4472
4473         /* length of rtnetlink header + attributes */
4474         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4475
4476         return 0;
4477
4478 nla_put_failure:
4479         return -EMSGSIZE;
4480 }
4481
4482 static int rt6_fill_node(struct net *net,
4483                          struct sk_buff *skb, struct rt6_info *rt,
4484                          struct in6_addr *dst, struct in6_addr *src,
4485                          int iif, int type, u32 portid, u32 seq,
4486                          unsigned int flags)
4487 {
4488         u32 metrics[RTAX_MAX];
4489         struct rtmsg *rtm;
4490         struct nlmsghdr *nlh;
4491         long expires;
4492         u32 table;
4493
4494         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4495         if (!nlh)
4496                 return -EMSGSIZE;
4497
4498         rtm = nlmsg_data(nlh);
4499         rtm->rtm_family = AF_INET6;
4500         rtm->rtm_dst_len = rt->rt6i_dst.plen;
4501         rtm->rtm_src_len = rt->rt6i_src.plen;
4502         rtm->rtm_tos = 0;
4503         if (rt->rt6i_table)
4504                 table = rt->rt6i_table->tb6_id;
4505         else
4506                 table = RT6_TABLE_UNSPEC;
4507         rtm->rtm_table = table;
4508         if (nla_put_u32(skb, RTA_TABLE, table))
4509                 goto nla_put_failure;
4510         if (rt->rt6i_flags & RTF_REJECT) {
4511                 switch (rt->dst.error) {
4512                 case -EINVAL:
4513                         rtm->rtm_type = RTN_BLACKHOLE;
4514                         break;
4515                 case -EACCES:
4516                         rtm->rtm_type = RTN_PROHIBIT;
4517                         break;
4518                 case -EAGAIN:
4519                         rtm->rtm_type = RTN_THROW;
4520                         break;
4521                 default:
4522                         rtm->rtm_type = RTN_UNREACHABLE;
4523                         break;
4524                 }
4525         }
4526         else if (rt->rt6i_flags & RTF_LOCAL)
4527                 rtm->rtm_type = RTN_LOCAL;
4528         else if (rt->rt6i_flags & RTF_ANYCAST)
4529                 rtm->rtm_type = RTN_ANYCAST;
4530         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4531                 rtm->rtm_type = RTN_LOCAL;
4532         else
4533                 rtm->rtm_type = RTN_UNICAST;
4534         rtm->rtm_flags = 0;
4535         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4536         rtm->rtm_protocol = rt->rt6i_protocol;
4537
4538         if (rt->rt6i_flags & RTF_CACHE)
4539                 rtm->rtm_flags |= RTM_F_CLONED;
4540
4541         if (dst) {
4542                 if (nla_put_in6_addr(skb, RTA_DST, dst))
4543                         goto nla_put_failure;
4544                 rtm->rtm_dst_len = 128;
4545         } else if (rtm->rtm_dst_len)
4546                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4547                         goto nla_put_failure;
4548 #ifdef CONFIG_IPV6_SUBTREES
4549         if (src) {
4550                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4551                         goto nla_put_failure;
4552                 rtm->rtm_src_len = 128;
4553         } else if (rtm->rtm_src_len &&
4554                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4555                 goto nla_put_failure;
4556 #endif
4557         if (iif) {
4558 #ifdef CONFIG_IPV6_MROUTE
4559                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4560                         int err = ip6mr_get_route(net, skb, rtm, portid);
4561
4562                         if (err == 0)
4563                                 return 0;
4564                         if (err < 0)
4565                                 goto nla_put_failure;
4566                 } else
4567 #endif
4568                         if (nla_put_u32(skb, RTA_IIF, iif))
4569                                 goto nla_put_failure;
4570         } else if (dst) {
4571                 struct in6_addr saddr_buf;
4572                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4573                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4574                         goto nla_put_failure;
4575         }
4576
4577         if (rt->rt6i_prefsrc.plen) {
4578                 struct in6_addr saddr_buf;
4579                 saddr_buf = rt->rt6i_prefsrc.addr;
4580                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4581                         goto nla_put_failure;
4582         }
4583
4584         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4585         if (rt->rt6i_pmtu)
4586                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4587         if (rtnetlink_put_metrics(skb, metrics) < 0)
4588                 goto nla_put_failure;
4589
4590         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4591                 goto nla_put_failure;
4592
4593         /* For multipath routes, walk the siblings list and add
4594          * each as a nexthop within RTA_MULTIPATH.
4595          */
4596         if (rt->rt6i_nsiblings) {
4597                 struct rt6_info *sibling, *next_sibling;
4598                 struct nlattr *mp;
4599
4600                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4601                 if (!mp)
4602                         goto nla_put_failure;
4603
4604                 if (rt6_add_nexthop(skb, rt) < 0)
4605                         goto nla_put_failure;
4606
4607                 list_for_each_entry_safe(sibling, next_sibling,
4608                                          &rt->rt6i_siblings, rt6i_siblings) {
4609                         if (rt6_add_nexthop(skb, sibling) < 0)
4610                                 goto nla_put_failure;
4611                 }
4612
4613                 nla_nest_end(skb, mp);
4614         } else {
4615                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4616                         goto nla_put_failure;
4617         }
4618
4619         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4620
4621         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4622                 goto nla_put_failure;
4623
4624         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4625                 goto nla_put_failure;
4626
4627
4628         nlmsg_end(skb, nlh);
4629         return 0;
4630
4631 nla_put_failure:
4632         nlmsg_cancel(skb, nlh);
4633         return -EMSGSIZE;
4634 }
4635
4636 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4637 {
4638         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4639         struct net *net = arg->net;
4640
4641         if (rt == net->ipv6.ip6_null_entry)
4642                 return 0;
4643
4644         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4645                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4646
4647                 /* user wants prefix routes only */
4648                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4649                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4650                         /* success since this is not a prefix route */
4651                         return 1;
4652                 }
4653         }
4654
4655         return rt6_fill_node(net,
4656                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4657                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4658                      NLM_F_MULTI);
4659 }
4660
4661 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4662                               struct netlink_ext_ack *extack)
4663 {
4664         struct net *net = sock_net(in_skb->sk);
4665         struct nlattr *tb[RTA_MAX+1];
4666         int err, iif = 0, oif = 0;
4667         struct dst_entry *dst;
4668         struct rt6_info *rt;
4669         struct sk_buff *skb;
4670         struct rtmsg *rtm;
4671         struct flowi6 fl6;
4672         bool fibmatch;
4673
4674         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4675                           extack);
4676         if (err < 0)
4677                 goto errout;
4678
4679         err = -EINVAL;
4680         memset(&fl6, 0, sizeof(fl6));
4681         rtm = nlmsg_data(nlh);
4682         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4683         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4684
4685         if (tb[RTA_SRC]) {
4686                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4687                         goto errout;
4688
4689                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4690         }
4691
4692         if (tb[RTA_DST]) {
4693                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4694                         goto errout;
4695
4696                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4697         }
4698
4699         if (tb[RTA_IIF])
4700                 iif = nla_get_u32(tb[RTA_IIF]);
4701
4702         if (tb[RTA_OIF])
4703                 oif = nla_get_u32(tb[RTA_OIF]);
4704
4705         if (tb[RTA_MARK])
4706                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4707
4708         if (tb[RTA_UID])
4709                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4710                                            nla_get_u32(tb[RTA_UID]));
4711         else
4712                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4713
4714         if (iif) {
4715                 struct net_device *dev;
4716                 int flags = 0;
4717
4718                 rcu_read_lock();
4719
4720                 dev = dev_get_by_index_rcu(net, iif);
4721                 if (!dev) {
4722                         rcu_read_unlock();
4723                         err = -ENODEV;
4724                         goto errout;
4725                 }
4726
4727                 fl6.flowi6_iif = iif;
4728
4729                 if (!ipv6_addr_any(&fl6.saddr))
4730                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4731
4732                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4733
4734                 rcu_read_unlock();
4735         } else {
4736                 fl6.flowi6_oif = oif;
4737
4738                 dst = ip6_route_output(net, NULL, &fl6);
4739         }
4740
4741
4742         rt = container_of(dst, struct rt6_info, dst);
4743         if (rt->dst.error) {
4744                 err = rt->dst.error;
4745                 ip6_rt_put(rt);
4746                 goto errout;
4747         }
4748
4749         if (rt == net->ipv6.ip6_null_entry) {
4750                 err = rt->dst.error;
4751                 ip6_rt_put(rt);
4752                 goto errout;
4753         }
4754
4755         if (fibmatch && rt->from) {
4756                 struct rt6_info *ort = rt->from;
4757
4758                 dst_hold(&ort->dst);
4759                 ip6_rt_put(rt);
4760                 rt = ort;
4761         }
4762
4763         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4764         if (!skb) {
4765                 ip6_rt_put(rt);
4766                 err = -ENOBUFS;
4767                 goto errout;
4768         }
4769
4770         skb_dst_set(skb, &rt->dst);
4771         if (fibmatch)
4772                 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4773                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4774                                     nlh->nlmsg_seq, 0);
4775         else
4776                 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4777                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4778                                     nlh->nlmsg_seq, 0);
4779         if (err < 0) {
4780                 kfree_skb(skb);
4781                 goto errout;
4782         }
4783
4784         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4785 errout:
4786         return err;
4787 }
4788
4789 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4790                      unsigned int nlm_flags)
4791 {
4792         struct sk_buff *skb;
4793         struct net *net = info->nl_net;
4794         u32 seq;
4795         int err;
4796
4797         err = -ENOBUFS;
4798         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4799
4800         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4801         if (!skb)
4802                 goto errout;
4803
4804         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4805                                 event, info->portid, seq, nlm_flags);
4806         if (err < 0) {
4807                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4808                 WARN_ON(err == -EMSGSIZE);
4809                 kfree_skb(skb);
4810                 goto errout;
4811         }
4812         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4813                     info->nlh, gfp_any());
4814         return;
4815 errout:
4816         if (err < 0)
4817                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4818 }
4819
4820 static int ip6_route_dev_notify(struct notifier_block *this,
4821                                 unsigned long event, void *ptr)
4822 {
4823         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4824         struct net *net = dev_net(dev);
4825
4826         if (!(dev->flags & IFF_LOOPBACK))
4827                 return NOTIFY_OK;
4828
4829         if (event == NETDEV_REGISTER) {
4830                 net->ipv6.ip6_null_entry->dst.dev = dev;
4831                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4832 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4833                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4834                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4835                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4836                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4837 #endif
4838          } else if (event == NETDEV_UNREGISTER &&
4839                     dev->reg_state != NETREG_UNREGISTERED) {
4840                 /* NETDEV_UNREGISTER could be fired for multiple times by
4841                  * netdev_wait_allrefs(). Make sure we only call this once.
4842                  */
4843                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4844 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4845                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4846                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4847 #endif
4848         }
4849
4850         return NOTIFY_OK;
4851 }
4852
4853 /*
4854  *      /proc
4855  */
4856
4857 #ifdef CONFIG_PROC_FS
4858
4859 static const struct file_operations ipv6_route_proc_fops = {
4860         .open           = ipv6_route_open,
4861         .read           = seq_read,
4862         .llseek         = seq_lseek,
4863         .release        = seq_release_net,
4864 };
4865
4866 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4867 {
4868         struct net *net = (struct net *)seq->private;
4869         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4870                    net->ipv6.rt6_stats->fib_nodes,
4871                    net->ipv6.rt6_stats->fib_route_nodes,
4872                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4873                    net->ipv6.rt6_stats->fib_rt_entries,
4874                    net->ipv6.rt6_stats->fib_rt_cache,
4875                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4876                    net->ipv6.rt6_stats->fib_discarded_routes);
4877
4878         return 0;
4879 }
4880
4881 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4882 {
4883         return single_open_net(inode, file, rt6_stats_seq_show);
4884 }
4885
4886 static const struct file_operations rt6_stats_seq_fops = {
4887         .open    = rt6_stats_seq_open,
4888         .read    = seq_read,
4889         .llseek  = seq_lseek,
4890         .release = single_release_net,
4891 };
4892 #endif  /* CONFIG_PROC_FS */
4893
4894 #ifdef CONFIG_SYSCTL
4895
4896 static
4897 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4898                               void __user *buffer, size_t *lenp, loff_t *ppos)
4899 {
4900         struct net *net;
4901         int delay;
4902         if (!write)
4903                 return -EINVAL;
4904
4905         net = (struct net *)ctl->extra1;
4906         delay = net->ipv6.sysctl.flush_delay;
4907         proc_dointvec(ctl, write, buffer, lenp, ppos);
4908         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4909         return 0;
4910 }
4911
4912 struct ctl_table ipv6_route_table_template[] = {
4913         {
4914                 .procname       =       "flush",
4915                 .data           =       &init_net.ipv6.sysctl.flush_delay,
4916                 .maxlen         =       sizeof(int),
4917                 .mode           =       0200,
4918                 .proc_handler   =       ipv6_sysctl_rtcache_flush
4919         },
4920         {
4921                 .procname       =       "gc_thresh",
4922                 .data           =       &ip6_dst_ops_template.gc_thresh,
4923                 .maxlen         =       sizeof(int),
4924                 .mode           =       0644,
4925                 .proc_handler   =       proc_dointvec,
4926         },
4927         {
4928                 .procname       =       "max_size",
4929                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
4930                 .maxlen         =       sizeof(int),
4931                 .mode           =       0644,
4932                 .proc_handler   =       proc_dointvec,
4933         },
4934         {
4935                 .procname       =       "gc_min_interval",
4936                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4937                 .maxlen         =       sizeof(int),
4938                 .mode           =       0644,
4939                 .proc_handler   =       proc_dointvec_jiffies,
4940         },
4941         {
4942                 .procname       =       "gc_timeout",
4943                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4944                 .maxlen         =       sizeof(int),
4945                 .mode           =       0644,
4946                 .proc_handler   =       proc_dointvec_jiffies,
4947         },
4948         {
4949                 .procname       =       "gc_interval",
4950                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4951                 .maxlen         =       sizeof(int),
4952                 .mode           =       0644,
4953                 .proc_handler   =       proc_dointvec_jiffies,
4954         },
4955         {
4956                 .procname       =       "gc_elasticity",
4957                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4958                 .maxlen         =       sizeof(int),
4959                 .mode           =       0644,
4960                 .proc_handler   =       proc_dointvec,
4961         },
4962         {
4963                 .procname       =       "mtu_expires",
4964                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4965                 .maxlen         =       sizeof(int),
4966                 .mode           =       0644,
4967                 .proc_handler   =       proc_dointvec_jiffies,
4968         },
4969         {
4970                 .procname       =       "min_adv_mss",
4971                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4972                 .maxlen         =       sizeof(int),
4973                 .mode           =       0644,
4974                 .proc_handler   =       proc_dointvec,
4975         },
4976         {
4977                 .procname       =       "gc_min_interval_ms",
4978                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4979                 .maxlen         =       sizeof(int),
4980                 .mode           =       0644,
4981                 .proc_handler   =       proc_dointvec_ms_jiffies,
4982         },
4983         { }
4984 };
4985
4986 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4987 {
4988         struct ctl_table *table;
4989
4990         table = kmemdup(ipv6_route_table_template,
4991                         sizeof(ipv6_route_table_template),
4992                         GFP_KERNEL);
4993
4994         if (table) {
4995                 table[0].data = &net->ipv6.sysctl.flush_delay;
4996                 table[0].extra1 = net;
4997                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4998                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4999                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5000                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5001                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5002                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5003                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5004                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5005                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5006
5007                 /* Don't export sysctls to unprivileged users */
5008                 if (net->user_ns != &init_user_ns)
5009                         table[0].procname = NULL;
5010         }
5011
5012         return table;
5013 }
5014 #endif
5015
5016 static int __net_init ip6_route_net_init(struct net *net)
5017 {
5018         int ret = -ENOMEM;
5019
5020         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5021                sizeof(net->ipv6.ip6_dst_ops));
5022
5023         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5024                 goto out_ip6_dst_ops;
5025
5026         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5027                                            sizeof(*net->ipv6.ip6_null_entry),
5028                                            GFP_KERNEL);
5029         if (!net->ipv6.ip6_null_entry)
5030                 goto out_ip6_dst_entries;
5031         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5032         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5033                          ip6_template_metrics, true);
5034
5035 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5036         net->ipv6.fib6_has_custom_rules = false;
5037         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5038                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5039                                                GFP_KERNEL);
5040         if (!net->ipv6.ip6_prohibit_entry)
5041                 goto out_ip6_null_entry;
5042         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5043         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5044                          ip6_template_metrics, true);
5045
5046         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5047                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5048                                                GFP_KERNEL);
5049         if (!net->ipv6.ip6_blk_hole_entry)
5050                 goto out_ip6_prohibit_entry;
5051         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5052         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5053                          ip6_template_metrics, true);
5054 #endif
5055
5056         net->ipv6.sysctl.flush_delay = 0;
5057         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5058         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5059         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5060         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5061         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5062         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5063         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5064
5065         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5066
5067         ret = 0;
5068 out:
5069         return ret;
5070
5071 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5072 out_ip6_prohibit_entry:
5073         kfree(net->ipv6.ip6_prohibit_entry);
5074 out_ip6_null_entry:
5075         kfree(net->ipv6.ip6_null_entry);
5076 #endif
5077 out_ip6_dst_entries:
5078         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5079 out_ip6_dst_ops:
5080         goto out;
5081 }
5082
5083 static void __net_exit ip6_route_net_exit(struct net *net)
5084 {
5085         kfree(net->ipv6.ip6_null_entry);
5086 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5087         kfree(net->ipv6.ip6_prohibit_entry);
5088         kfree(net->ipv6.ip6_blk_hole_entry);
5089 #endif
5090         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5091 }
5092
5093 static int __net_init ip6_route_net_init_late(struct net *net)
5094 {
5095 #ifdef CONFIG_PROC_FS
5096         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5097         proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5098 #endif
5099         return 0;
5100 }
5101
5102 static void __net_exit ip6_route_net_exit_late(struct net *net)
5103 {
5104 #ifdef CONFIG_PROC_FS
5105         remove_proc_entry("ipv6_route", net->proc_net);
5106         remove_proc_entry("rt6_stats", net->proc_net);
5107 #endif
5108 }
5109
5110 static struct pernet_operations ip6_route_net_ops = {
5111         .init = ip6_route_net_init,
5112         .exit = ip6_route_net_exit,
5113 };
5114
5115 static int __net_init ipv6_inetpeer_init(struct net *net)
5116 {
5117         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5118
5119         if (!bp)
5120                 return -ENOMEM;
5121         inet_peer_base_init(bp);
5122         net->ipv6.peers = bp;
5123         return 0;
5124 }
5125
5126 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5127 {
5128         struct inet_peer_base *bp = net->ipv6.peers;
5129
5130         net->ipv6.peers = NULL;
5131         inetpeer_invalidate_tree(bp);
5132         kfree(bp);
5133 }
5134
5135 static struct pernet_operations ipv6_inetpeer_ops = {
5136         .init   =       ipv6_inetpeer_init,
5137         .exit   =       ipv6_inetpeer_exit,
5138 };
5139
5140 static struct pernet_operations ip6_route_net_late_ops = {
5141         .init = ip6_route_net_init_late,
5142         .exit = ip6_route_net_exit_late,
5143 };
5144
5145 static struct notifier_block ip6_route_dev_notifier = {
5146         .notifier_call = ip6_route_dev_notify,
5147         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5148 };
5149
5150 void __init ip6_route_init_special_entries(void)
5151 {
5152         /* Registering of the loopback is done before this portion of code,
5153          * the loopback reference in rt6_info will not be taken, do it
5154          * manually for init_net */
5155         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5156         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5157   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5158         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5159         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5160         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5161         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5162   #endif
5163 }
5164
5165 int __init ip6_route_init(void)
5166 {
5167         int ret;
5168         int cpu;
5169
5170         ret = -ENOMEM;
5171         ip6_dst_ops_template.kmem_cachep =
5172                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5173                                   SLAB_HWCACHE_ALIGN, NULL);
5174         if (!ip6_dst_ops_template.kmem_cachep)
5175                 goto out;
5176
5177         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5178         if (ret)
5179                 goto out_kmem_cache;
5180
5181         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5182         if (ret)
5183                 goto out_dst_entries;
5184
5185         ret = register_pernet_subsys(&ip6_route_net_ops);
5186         if (ret)
5187                 goto out_register_inetpeer;
5188
5189         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5190
5191         ret = fib6_init();
5192         if (ret)
5193                 goto out_register_subsys;
5194
5195         ret = xfrm6_init();
5196         if (ret)
5197                 goto out_fib6_init;
5198
5199         ret = fib6_rules_init();
5200         if (ret)
5201                 goto xfrm6_init;
5202
5203         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5204         if (ret)
5205                 goto fib6_rules_init;
5206
5207         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5208                                    inet6_rtm_newroute, NULL, 0);
5209         if (ret < 0)
5210                 goto out_register_late_subsys;
5211
5212         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5213                                    inet6_rtm_delroute, NULL, 0);
5214         if (ret < 0)
5215                 goto out_register_late_subsys;
5216
5217         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5218                                    inet6_rtm_getroute, NULL,
5219                                    RTNL_FLAG_DOIT_UNLOCKED);
5220         if (ret < 0)
5221                 goto out_register_late_subsys;
5222
5223         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5224         if (ret)
5225                 goto out_register_late_subsys;
5226
5227         for_each_possible_cpu(cpu) {
5228                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5229
5230                 INIT_LIST_HEAD(&ul->head);
5231                 spin_lock_init(&ul->lock);
5232         }
5233
5234 out:
5235         return ret;
5236
5237 out_register_late_subsys:
5238         rtnl_unregister_all(PF_INET6);
5239         unregister_pernet_subsys(&ip6_route_net_late_ops);
5240 fib6_rules_init:
5241         fib6_rules_cleanup();
5242 xfrm6_init:
5243         xfrm6_fini();
5244 out_fib6_init:
5245         fib6_gc_cleanup();
5246 out_register_subsys:
5247         unregister_pernet_subsys(&ip6_route_net_ops);
5248 out_register_inetpeer:
5249         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5250 out_dst_entries:
5251         dst_entries_destroy(&ip6_dst_blackhole_ops);
5252 out_kmem_cache:
5253         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5254         goto out;
5255 }
5256
5257 void ip6_route_cleanup(void)
5258 {
5259         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5260         unregister_pernet_subsys(&ip6_route_net_late_ops);
5261         fib6_rules_cleanup();
5262         xfrm6_fini();
5263         fib6_gc_cleanup();
5264         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5265         unregister_pernet_subsys(&ip6_route_net_ops);
5266         dst_entries_destroy(&ip6_dst_blackhole_ops);
5267         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5268 }
This page took 0.328461 seconds and 4 git commands to generate.