]> Git Repo - linux.git/blob - net/ipv6/route.c
net/ipv6: Add support for path selection using hash of 5-tuple
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <[email protected]>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Result codes for next-hop reachability checks (rt6_check_neigh()).
 * Negative values are failures with different follow-up actions.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* do not use this route at all */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour failed; worth probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* unknown neighbour; try round-robin */
	RT6_NUD_SUCCEED = 1
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-cpu list of "uncached" rt6_info entries; @lock protects @head. */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
/* Device teardown: walk every CPU's uncached list and repoint routes
 * still referencing @dev (device and/or idev) at the namespace loopback
 * device, so the routes can safely outlive @dev.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback only disappears when the netns itself does */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				/* swap the idev ref over to loopback */
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				/* hold loopback before dropping the old
				 * device reference
				 */
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
186
/* Per-cpu clones write through to their parent route's metrics. */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
/* dst_ops->confirm_neigh: confirm reachability of the route's next hop,
 * except on devices that do no neighbour resolution and for multicast
 * destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
245
/* Template for the per-namespace IPv6 dst_ops (net->ipv6.ip6_dst_ops). */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Blackhole dsts intentionally ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276
/* Blackhole dsts intentionally ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
/* dst_ops for blackhole entries: PMTU updates and redirects are no-ops. */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
293
/* Metrics table shared by the template routes below; RTAX_HOPLIMIT
 * is explicitly left at 0.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297
/* Terminal "no route" entry: rejects all traffic with -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* "Administratively prohibited" entry: rejects traffic with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
329
/* Blackhole entry: silently discards traffic via dst_discard{,_out}. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
344
345 #endif
346
/* Zero every rt6_info field that follows the embedded dst_entry (which
 * dst_alloc() already initialized) and set up the list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* dst + 1 == first byte after the dst member */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
/* allocate dst with ip6_dst_ops
 *
 * Returns a freshly initialized rt6_info (refcount 1) or NULL on
 * allocation failure; bumps the namespace fib_rt_alloc counter.
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
/* dst_ops->destroy: final teardown once the last reference is gone.
 * Releases metrics, the per-cpu clone table, the exception bucket, the
 * idev and parent ("from") references, and unlinks the route from any
 * uncached list.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* last user; plain dereference is safe (hence the "1" condition) */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}
415
/* dst_ops->ifdown: @dev is going away; repoint the route's inet6_dev
 * reference at the namespace loopback so the old idev can be released.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
/* Like __rt6_check_expired(), but a clone without its own expiry also
 * counts as expired when its parent ("from") route has been obsoleted
 * or has itself expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		/* recurse into the parent route */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			rt6_check_expired(rt->from);
	}
	return false;
}
452
/* ECMP path selection via hash-threshold: each sibling owns a slice of
 * the hash space bounded by rt6i_nh_upper_bound, and the flow hash
 * picks the slice it lands in.  Falls back to @match when the chosen
 * sibling scores negatively.
 */
static struct rt6_info *rt6_multipath_select(const struct net *net,
					     struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* hash falls into the first route's slice */
	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
482
483 /*
484  *      Route lookup. rcu_read_lock() should be held.
485  */
486
/* Walk the route list starting at @rt and pick the entry that best
 * matches the output interface (@oif) or, with no @oif, the source
 * address.  Dead nexthops are skipped.  With RT6_LOOKUP_F_IFACE set,
 * failing to match the interface yields ip6_null_entry; otherwise a
 * loopback route may stand in for the requested interface.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to constrain on: first live route wins */
	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* prefer a loopback route whose idev
					 * matches @oif over one that doesn't
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}
536
537 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request: carries the probe target and
 * a held device reference to the workqueue handler.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
543
/* Workqueue handler for rt6_probe(): send a Neighbour Solicitation to
 * the target's solicited-node multicast address, then drop the device
 * reference taken by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
555
/* Schedule a reachability probe of @rt's gateway if its neighbour entry
 * is missing or stale.  The actual NS is sent from a workqueue
 * (rt6_probe_deferred) since this runs under rcu_read_lock_bh().
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* rate-limit by rtr_probe_interval, re-checking state
		 * under the neighbour lock
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry at all: always probe */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* reference dropped in rt6_probe_deferred() */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
602 #else
/* Router reachability probing needs CONFIG_IPV6_ROUTER_PREF; no-op here. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
606 #endif
607
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
/* Score @rt's device against the requested output interface:
 * 2 = exact match (or no @oif given), 1 = loopback whose idev is bound
 * to @oif, 0 = mismatch.
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}
621
/* Judge reachability of @rt's next hop via the neighbour cache.
 * Gateway-less (or RTF_NONEXTHOP) routes always succeed.  With router
 * preference compiled in, any non-FAILED neighbour state succeeds and
 * a missing entry also succeeds (it will be probed); without it, a
 * missing entry asks the caller to round-robin.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
652
/* Combine the interface-match score with (under ROUTER_PREF) the
 * RA-advertised route preference; with RT6_LOOKUP_F_REACHABLE also
 * factor in neighbour reachability.  Negative return values are
 * rt6_nud_state failure codes.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* preference occupies bits above the device-match score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
671
/* Compare @rt against the current best candidate @match, updating
 * *mpri (best score so far) and *do_rr (round-robin requested).
 * Dead, administratively-ignored link-down, and expired routes are
 * skipped.  Returns the (possibly new) best match.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
711
/* Scan the routes of a fib6 node that share @metric for the best
 * candidate, starting at @rr_head and wrapping around via @leaf so the
 * round-robin start point rotates.  Routes with a different metric
 * (remembered in @cont) are only considered if nothing matched at the
 * requested metric.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: from rr_head to the end of the metric group */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: wrap around from the leaf up to rr_head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing at the preferred metric: fall back to the rest */
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
750
/* Select the best route from fib6 node @fn, advancing the node's
 * round-robin pointer (fn->rr_ptr, under tb6_lock) when find_match()
 * asked for round-robin.  Returns ip6_null_entry when the node holds
 * nothing usable.  Caller holds rcu_read_lock().
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
800
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802 {
803         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804 }
805
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement from gateway @gwaddr on @dev: validate the option and
 * then add, refresh or withdraw the corresponding RTF_ROUTEINFO route.
 * Returns 0 on success, -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length: the option length
	 * (in units of 8 octets, max 3) must be big enough to carry
	 * prefix_len bits of prefix.
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* length == 3 means a full 16-byte prefix is present; otherwise
	 * build a zero-padded copy of the truncated prefix.
	 */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero prefix length denotes the default route via this router. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime: the router is withdrawing this route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		/* Refresh the preference bits on the existing route. */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
881
882 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883                                         struct in6_addr *saddr)
884 {
885         struct fib6_node *pn, *sn;
886         while (1) {
887                 if (fn->fn_flags & RTN_TL_ROOT)
888                         return NULL;
889                 pn = rcu_dereference(fn->parent);
890                 sn = FIB6_SUBTREE(pn);
891                 if (sn && sn != fn)
892                         fn = fib6_lookup(sn, NULL, saddr);
893                 else
894                         fn = pn;
895                 if (fn->fn_flags & RTN_RTINFO)
896                         return fn;
897         }
898 }
899
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901                           bool null_fallback)
902 {
903         struct rt6_info *rt = *prt;
904
905         if (dst_hold_safe(&rt->dst))
906                 return true;
907         if (null_fallback) {
908                 rt = net->ipv6.ip6_null_entry;
909                 dst_hold(&rt->dst);
910         } else {
911                 rt = NULL;
912         }
913         *prt = rt;
914         return false;
915 }
916
/* Flow-based lookup within one fib6 table: find the best-matching trie
 * node for daddr/saddr, match on the output device, pick a multipath
 * sibling when the flow is not bound to an oif, and prefer a matching
 * cached exception route.  Returns a referenced rt6_info (possibly the
 * null entry).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
						  skb, flags);
	}
	/* Nothing usable at this node: back up the trie and retry. */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* Take a reference; on failure ip6_hold_safe() substitutes a
	 * held null entry, so rt is always safe to return.
	 */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;

}
959
/* Public lookup entry point: dispatch through the fib6 policy rules
 * using ip6_pol_route_lookup() as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
966
967 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
968                             const struct in6_addr *saddr, int oif,
969                             const struct sk_buff *skb, int strict)
970 {
971         struct flowi6 fl6 = {
972                 .flowi6_oif = oif,
973                 .daddr = *daddr,
974         };
975         struct dst_entry *dst;
976         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
977
978         if (saddr) {
979                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
980                 flags |= RT6_LOOKUP_F_HAS_SADDR;
981         }
982
983         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
984         if (dst->error == 0)
985                 return (struct rt6_info *) dst;
986
987         dst_release(dst);
988
989         return NULL;
990 }
991 EXPORT_SYMBOL(rt6_lookup);
992
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold a dst reference before calling it.
 */
998
999 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1000                         struct mx6_config *mxc,
1001                         struct netlink_ext_ack *extack)
1002 {
1003         int err;
1004         struct fib6_table *table;
1005
1006         table = rt->rt6i_table;
1007         spin_lock_bh(&table->tb6_lock);
1008         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1009         spin_unlock_bh(&table->tb6_lock);
1010
1011         return err;
1012 }
1013
/* Insert @rt into the fib6 tree with default netlink info and no
 * metrics, taking the dst reference the tree will own.
 */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
1023
1024 /* called with rcu_lock held */
1025 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1026 {
1027         struct net_device *dev = rt->dst.dev;
1028
1029         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1030                 /* for copies of local routes, dst->dev needs to be the
1031                  * device if it is a master device, the master device if
1032                  * device is enslaved, and the loopback as the default
1033                  */
1034                 if (netif_is_l3_slave(dev) &&
1035                     !rt6_need_strict(&rt->rt6i_dst.addr))
1036                         dev = l3mdev_master_dev_rcu(dev);
1037                 else if (!netif_is_l3_master(dev))
1038                         dev = dev_net(dev)->loopback_dev;
1039                 /* last case is netif_is_l3_master(dev) is true in which
1040                  * case we want dev returned to be dev
1041                  */
1042         }
1043
1044         return dev;
1045 }
1046
/* Create an RTF_CACHE clone of @ort for destination @daddr (and, with
 * subtrees, source @saddr).  The clone is a /128 host route not owned
 * by the fib6 tree.  Returns NULL on allocation failure; the returned
 * route carries the reference taken by __ip6_dst_alloc().
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* Cache/pcpu entries are themselves clones: copy from the fib
	 * route they were derived from instead.
	 */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr equal to a non-/128 on-link prefix address is
		 * treated as anycast.
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1089
/* Allocate a per-cpu (RTF_PCPU) copy of @rt.
 * Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1106
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	/* Only return the cached copy if a reference could be taken;
	 * with null_fallback == false, ip6_hold_safe() sets pcpu_rt to
	 * NULL when the dst is already being destroyed.
	 */
	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
1120
/* Create and install this cpu's per-cpu copy of @rt.  On allocation
 * failure a held null entry is returned instead.  The returned route
 * always carries a reference for the caller.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* The slot must be empty here: callers only reach this after
	 * rt6_get_pcpu_route() found no existing copy for this cpu.
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1141
1142 /* exception hash table implementation
1143  */
1144 static DEFINE_SPINLOCK(rt6_exception_lock);
1145
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	/* Detach the cached route from its fib node before unlinking. */
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	/* RCU readers may still be walking the entry: defer the free. */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1166
1167 /* Remove oldest rt6_ex in bucket and free the memory
1168  * Caller must hold rt6_exception_lock
1169  */
1170 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1171 {
1172         struct rt6_exception *rt6_ex, *oldest = NULL;
1173
1174         if (!bucket)
1175                 return;
1176
1177         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1178                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1179                         oldest = rt6_ex;
1180         }
1181         rt6_remove_exception(bucket, oldest);
1182 }
1183
1184 static u32 rt6_exception_hash(const struct in6_addr *dst,
1185                               const struct in6_addr *src)
1186 {
1187         static u32 seed __read_mostly;
1188         u32 val;
1189
1190         net_get_random_once(&seed, sizeof(seed));
1191         val = jhash(dst, sizeof(*dst), seed);
1192
1193 #ifdef CONFIG_IPV6_SUBTREES
1194         if (src)
1195                 val = jhash(src, sizeof(*src), val);
1196 #endif
1197         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1198 }
1199
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* Advance *bucket to the hashed slot; the caller relies on this
	 * side effect for a subsequent insert/remove.
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1232
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* Advance *bucket to the hashed slot (same side effect as the
	 * spinlock variant above).
	 */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1267
/* Insert cached route @nrt into the exception table of fib route @ort.
 * Replaces any existing entry for the same (daddr, saddr) key and
 * evicts the oldest entry when a bucket grows past FIB6_MAX_DEPTH.
 * On success the fib node's sernum is bumped to invalidate cached dsts.
 * Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being deleted; don't resurrect its bucket list. */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		/* First exception on this route: allocate the hash table. */
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* Replace, rather than duplicate, an existing entry. */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Bound the chain length by evicting the oldest entry. */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1359
1360 void rt6_flush_exceptions(struct rt6_info *rt)
1361 {
1362         struct rt6_exception_bucket *bucket;
1363         struct rt6_exception *rt6_ex;
1364         struct hlist_node *tmp;
1365         int i;
1366
1367         spin_lock_bh(&rt6_exception_lock);
1368         /* Prevent rt6_insert_exception() to recreate the bucket list */
1369         rt->exception_bucket_flushed = 1;
1370
1371         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1372                                     lockdep_is_held(&rt6_exception_lock));
1373         if (!bucket)
1374                 goto out;
1375
1376         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1377                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1378                         rt6_remove_exception(bucket, rt6_ex);
1379                 WARN_ON_ONCE(bucket->depth);
1380                 bucket++;
1381         }
1382
1383 out:
1384         spin_unlock_bh(&rt6_exception_lock);
1385 }
1386
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns the cached route, or NULL when none exists or the match
 * has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* Expired entries are skipped but left for gc to remove. */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1418
/* Remove the passed in cached rt from the hash table that contains it
 * Returns 0 on success, -EINVAL when rt is not a cached route, or
 * -ENOENT when it is not found in its parent's exception table.
 */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1461
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 * Used to keep actively-used cached routes from being aged out.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	/* Only cached routes derived from a fib route live in a table. */
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1497
1498 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1499 {
1500         struct rt6_exception_bucket *bucket;
1501         struct rt6_exception *rt6_ex;
1502         int i;
1503
1504         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505                                         lockdep_is_held(&rt6_exception_lock));
1506
1507         if (bucket) {
1508                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1509                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1510                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1511                         }
1512                         bucket++;
1513                 }
1514         }
1515 }
1516
/* Lower the pmtu of every cached exception route of @rt to @mtu.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from has already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}
1542
1543 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1544
/* Remove every cached exception route of @rt whose next hop is
 * @gateway (redirect clean-up when a gateway goes away).
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check before taking the exception lock. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				/* Only gateway'd cache entries pointing at
				 * the given gateway are removed.
				 */
				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1579
/* Garbage-collect a single exception entry: remove it when it has aged
 * out, expired, or its gateway neighbour is no longer a router;
 * otherwise count it so gc keeps running.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		/* A gateway whose neighbour entry lost NTF_ROUTER (or has
		 * no neighbour entry at all) is no longer usable.
		 */
		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* Entry survives this round; tell gc there is still work left. */
	gc_args->more++;
}
1624
/* Run exception-cache garbage collection over every bucket of @rt,
 * delegating per-entry policy to rt6_age_examine_exception().
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check before taking the exception lock. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
1653
/* ip6_pol_route - core policy-routing lookup in a single FIB table.
 * @net:   network namespace
 * @table: FIB table to search
 * @oif:   output interface constraint (cleared when FLOWI_FLAG_SKIP_NH_OIF)
 * @fl6:   flow keys (daddr/saddr and flags) driving the lookup
 * @skb:   packet being routed; may be NULL (used for multipath hashing)
 * @flags: RT6_LOOKUP_F_* modifiers
 *
 * Never returns NULL: on failure the namespace's ip6_null_entry is
 * returned with a reference held.  In all cases the caller owns one
 * reference on the returned route and must release it.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags)
{
        struct fib6_node *fn, *saved_fn;
        struct rt6_info *rt, *rt_cache;
        int strict = 0;

        strict |= flags & RT6_LOOKUP_F_IFACE;
        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
        /* hosts (forwarding disabled) prefer routers known reachable */
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;

        rcu_read_lock();

        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        /* remember the starting node so the REACHABLE retry below can
         * restart from the best match rather than a backtracked node
         */
        saved_fn = fn;

        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
                oif = 0;

redo_rt6_select:
        rt = rt6_select(net, fn, oif, strict);
        if (rt->rt6i_nsiblings)
                rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
        if (rt == net->ipv6.ip6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto redo_rt6_select;
                else if (strict & RT6_LOOKUP_F_REACHABLE) {
                        /* also consider unreachable route */
                        strict &= ~RT6_LOOKUP_F_REACHABLE;
                        fn = saved_fn;
                        goto redo_rt6_select;
                }
        }

        /* Search through exception table */
        rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
        if (rt_cache)
                rt = rt_cache;

        if (rt == net->ipv6.ip6_null_entry) {
                rcu_read_unlock();
                dst_hold(&rt->dst);
                trace_fib6_table_lookup(net, rt, table, fl6);
                return rt;
        } else if (rt->rt6i_flags & RTF_CACHE) {
                /* cached clone: take a noref-style hold while still under RCU */
                if (ip6_hold_safe(net, &rt, true)) {
                        dst_use_noref(&rt->dst, jiffies);
                        rt6_dst_from_metrics_check(rt);
                }
                rcu_read_unlock();
                trace_fib6_table_lookup(net, rt, table, fl6);
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !(rt->rt6i_flags & RTF_GATEWAY))) {
                /* Create a RTF_CACHE clone which will not be
                 * owned by the fib6 tree.  It is for the special case where
                 * the daddr in the skb during the neighbor look-up is different
                 * from the fl6->daddr used to look-up route here.
                 */

                struct rt6_info *uncached_rt;

                if (ip6_hold_safe(net, &rt, true)) {
                        dst_use_noref(&rt->dst, jiffies);
                } else {
                        /* rt is being freed; return it as-is (ip6_hold_safe
                         * substituted a held null entry on failure)
                         */
                        rcu_read_unlock();
                        uncached_rt = rt;
                        goto uncached_rt_out;
                }
                rcu_read_unlock();

                uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
                dst_release(&rt->dst);

                if (uncached_rt) {
                        /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
                        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
                } else {
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }

uncached_rt_out:
                trace_fib6_table_lookup(net, uncached_rt, table, fl6);
                return uncached_rt;

        } else {
                /* Get a percpu copy */

                struct rt6_info *pcpu_rt;

                dst_use_noref(&rt->dst, jiffies);
                local_bh_disable();
                pcpu_rt = rt6_get_pcpu_route(rt);

                if (!pcpu_rt) {
                        /* atomic_inc_not_zero() is needed when using rcu */
                        if (atomic_inc_not_zero(&rt->rt6i_ref)) {
                                /* No dst_hold() on rt is needed because grabbing
                                 * rt->rt6i_ref makes sure rt can't be released.
                                 */
                                pcpu_rt = rt6_make_pcpu_route(rt);
                                rt6_release(rt);
                        } else {
                                /* rt is already removed from tree */
                                pcpu_rt = net->ipv6.ip6_null_entry;
                                dst_hold(&pcpu_rt->dst);
                        }
                }
                local_bh_enable();
                rcu_read_unlock();
                trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
                return pcpu_rt;
        }
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1776
1777 static struct rt6_info *ip6_pol_route_input(struct net *net,
1778                                             struct fib6_table *table,
1779                                             struct flowi6 *fl6,
1780                                             const struct sk_buff *skb,
1781                                             int flags)
1782 {
1783         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1784 }
1785
1786 struct dst_entry *ip6_route_input_lookup(struct net *net,
1787                                          struct net_device *dev,
1788                                          struct flowi6 *fl6,
1789                                          const struct sk_buff *skb,
1790                                          int flags)
1791 {
1792         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1793                 flags |= RT6_LOOKUP_F_IFACE;
1794
1795         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1796 }
1797 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1798
1799 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1800                                   struct flow_keys *keys,
1801                                   struct flow_keys *flkeys)
1802 {
1803         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1804         const struct ipv6hdr *key_iph = outer_iph;
1805         struct flow_keys *_flkeys = flkeys;
1806         const struct ipv6hdr *inner_iph;
1807         const struct icmp6hdr *icmph;
1808         struct ipv6hdr _inner_iph;
1809
1810         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1811                 goto out;
1812
1813         icmph = icmp6_hdr(skb);
1814         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1815             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1816             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1817             icmph->icmp6_type != ICMPV6_PARAMPROB)
1818                 goto out;
1819
1820         inner_iph = skb_header_pointer(skb,
1821                                        skb_transport_offset(skb) + sizeof(*icmph),
1822                                        sizeof(_inner_iph), &_inner_iph);
1823         if (!inner_iph)
1824                 goto out;
1825
1826         key_iph = inner_iph;
1827         _flkeys = NULL;
1828 out:
1829         if (_flkeys) {
1830                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1831                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1832                 keys->tags.flow_label = _flkeys->tags.flow_label;
1833                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1834         } else {
1835                 keys->addrs.v6addrs.src = key_iph->saddr;
1836                 keys->addrs.v6addrs.dst = key_iph->daddr;
1837                 keys->tags.flow_label = ip6_flowinfo(key_iph);
1838                 keys->basic.ip_proto = key_iph->nexthdr;
1839         }
1840 }
1841
1842 /* if skb is set it will be used and fl6 can be NULL */
1843 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1844                        const struct sk_buff *skb, struct flow_keys *flkeys)
1845 {
1846         struct flow_keys hash_keys;
1847         u32 mhash;
1848
1849         switch (net->ipv6.sysctl.multipath_hash_policy) {
1850         case 0:
1851                 memset(&hash_keys, 0, sizeof(hash_keys));
1852                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1853                 if (skb) {
1854                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1855                 } else {
1856                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1857                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1858                         hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1859                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1860                 }
1861                 break;
1862         case 1:
1863                 if (skb) {
1864                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1865                         struct flow_keys keys;
1866
1867                         /* short-circuit if we already have L4 hash present */
1868                         if (skb->l4_hash)
1869                                 return skb_get_hash_raw(skb) >> 1;
1870
1871                         memset(&hash_keys, 0, sizeof(hash_keys));
1872
1873                         if (!flkeys) {
1874                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1875                                 flkeys = &keys;
1876                         }
1877                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1878                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1879                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1880                         hash_keys.ports.src = flkeys->ports.src;
1881                         hash_keys.ports.dst = flkeys->ports.dst;
1882                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1883                 } else {
1884                         memset(&hash_keys, 0, sizeof(hash_keys));
1885                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1886                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1887                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1888                         hash_keys.ports.src = fl6->fl6_sport;
1889                         hash_keys.ports.dst = fl6->fl6_dport;
1890                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1891                 }
1892                 break;
1893         }
1894         mhash = flow_hash_from_keys(&hash_keys);
1895
1896         return mhash >> 1;
1897 }
1898
1899 void ip6_route_input(struct sk_buff *skb)
1900 {
1901         const struct ipv6hdr *iph = ipv6_hdr(skb);
1902         struct net *net = dev_net(skb->dev);
1903         int flags = RT6_LOOKUP_F_HAS_SADDR;
1904         struct ip_tunnel_info *tun_info;
1905         struct flowi6 fl6 = {
1906                 .flowi6_iif = skb->dev->ifindex,
1907                 .daddr = iph->daddr,
1908                 .saddr = iph->saddr,
1909                 .flowlabel = ip6_flowinfo(iph),
1910                 .flowi6_mark = skb->mark,
1911                 .flowi6_proto = iph->nexthdr,
1912         };
1913         struct flow_keys *flkeys = NULL, _flkeys;
1914
1915         tun_info = skb_tunnel_info(skb);
1916         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1917                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1918
1919         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1920                 flkeys = &_flkeys;
1921
1922         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1923                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1924         skb_dst_drop(skb);
1925         skb_dst_set(skb,
1926                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1927 }
1928
1929 static struct rt6_info *ip6_pol_route_output(struct net *net,
1930                                              struct fib6_table *table,
1931                                              struct flowi6 *fl6,
1932                                              const struct sk_buff *skb,
1933                                              int flags)
1934 {
1935         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1936 }
1937
1938 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1939                                          struct flowi6 *fl6, int flags)
1940 {
1941         bool any_src;
1942
1943         if (rt6_need_strict(&fl6->daddr)) {
1944                 struct dst_entry *dst;
1945
1946                 dst = l3mdev_link_scope_lookup(net, fl6);
1947                 if (dst)
1948                         return dst;
1949         }
1950
1951         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1952
1953         any_src = ipv6_addr_any(&fl6->saddr);
1954         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1955             (fl6->flowi6_oif && any_src))
1956                 flags |= RT6_LOOKUP_F_IFACE;
1957
1958         if (!any_src)
1959                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1960         else if (sk)
1961                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1962
1963         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1964 }
1965 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1966
/* ip6_blackhole_route - clone @dst_orig into a discard-only dst.
 * The clone copies the original's metrics, gateway, flags and keys but
 * its input/output handlers drop every packet.  The original dst's
 * reference is consumed.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
        struct net_device *loopback_dev = net->loopback_dev;
        struct dst_entry *new = NULL;

        /* DST_OBSOLETE_DEAD: the blackhole dst is never revalidated */
        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
                       DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

                new = &rt->dst;
                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard_out;

                dst_copy_metrics(new, &ort->dst);

                rt->rt6i_idev = in6_dev_get(loopback_dev);
                rt->rt6i_gateway = ort->rt6i_gateway;
                /* not a per-cpu clone even if the original was */
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
                rt->rt6i_metric = 0;

                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
        }

        dst_release(dst_orig);
        return new ? new : ERR_PTR(-ENOMEM);
}
2000
2001 /*
2002  *      Destination cache support functions
2003  */
2004
2005 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2006 {
2007         if (rt->from &&
2008             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2009                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2010 }
2011
2012 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2013 {
2014         u32 rt_cookie = 0;
2015
2016         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2017                 return NULL;
2018
2019         if (rt6_check_expired(rt))
2020                 return NULL;
2021
2022         return &rt->dst;
2023 }
2024
2025 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2026 {
2027         if (!__rt6_check_expired(rt) &&
2028             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2029             rt6_check(rt->from, cookie))
2030                 return &rt->dst;
2031         else
2032                 return NULL;
2033 }
2034
2035 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2036 {
2037         struct rt6_info *rt;
2038
2039         rt = (struct rt6_info *) dst;
2040
2041         /* All IPV6 dsts are created with ->obsolete set to the value
2042          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2043          * into this function always.
2044          */
2045
2046         rt6_dst_from_metrics_check(rt);
2047
2048         if (rt->rt6i_flags & RTF_PCPU ||
2049             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2050                 return rt6_dst_from_check(rt, cookie);
2051         else
2052                 return rt6_check(rt, cookie);
2053 }
2054
2055 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2056 {
2057         struct rt6_info *rt = (struct rt6_info *) dst;
2058
2059         if (rt) {
2060                 if (rt->rt6i_flags & RTF_CACHE) {
2061                         if (rt6_check_expired(rt)) {
2062                                 ip6_del_rt(rt);
2063                                 dst = NULL;
2064                         }
2065                 } else {
2066                         dst_release(dst);
2067                         dst = NULL;
2068                 }
2069         }
2070         return dst;
2071 }
2072
/* dst_ops->link_failure: the packet's next hop is unusable.  Report
 * back to the sender and invalidate the route that led here.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = (struct rt6_info *) skb_dst(skb);
        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        /* cached clone: remove it entirely (needs a ref
                         * for ip6_del_rt to consume)
                         */
                        if (dst_hold_safe(&rt->dst))
                                ip6_del_rt(rt);
                } else {
                        struct fib6_node *fn;

                        rcu_read_lock();
                        fn = rcu_dereference(rt->rt6i_node);
                        /* poison the node's serial number so cached dsts
                         * fail their cookie check and get re-looked-up —
                         * NOTE(review): presumably; confirm against
                         * rt6_get_cookie_safe()
                         */
                        if (fn && (rt->rt6i_flags & RTF_DEFAULT))
                                fn->fn_sernum = -1;
                        rcu_read_unlock();
                }
        }
}
2095
2096 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2097 {
2098         struct net *net = dev_net(rt->dst.dev);
2099
2100         rt->rt6i_flags |= RTF_MODIFIED;
2101         rt->rt6i_pmtu = mtu;
2102         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2103 }
2104
2105 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2106 {
2107         return !(rt->rt6i_flags & RTF_CACHE) &&
2108                 (rt->rt6i_flags & RTF_PCPU ||
2109                  rcu_access_pointer(rt->rt6i_node));
2110 }
2111
/* __ip6_rt_update_pmtu - apply a learned path MTU to @dst.
 * @dst:  the route the MTU applies to
 * @sk:   optional socket supplying flow addresses when no header is given
 * @iph:  optional IPv6 header supplying flow addresses
 * @mtu:  the new MTU (clamped to at least IPV6_MIN_MTU)
 *
 * Depending on the route type the MTU is either recorded in place or
 * stored in a freshly cloned RTF_CACHE exception entry.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
                                 const struct ipv6hdr *iph, u32 mtu)
{
        const struct in6_addr *daddr, *saddr;
        struct rt6_info *rt6 = (struct rt6_info *)dst;

        /* local deliveries never need a path MTU */
        if (rt6->rt6i_flags & RTF_LOCAL)
                return;

        /* administratively locked MTU must not be changed */
        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        /* pick flow addresses: explicit header wins, then socket */
        if (iph) {
                daddr = &iph->daddr;
                saddr = &iph->saddr;
        } else if (sk) {
                daddr = &sk->sk_v6_daddr;
                saddr = &inet6_sk(sk)->saddr;
        } else {
                daddr = NULL;
                saddr = NULL;
        }
        dst_confirm_neigh(dst, daddr);
        mtu = max_t(u32, mtu, IPV6_MIN_MTU);
        /* only ever shrink the MTU */
        if (mtu >= dst_mtu(dst))
                return;

        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                rt6_do_update_pmtu(rt6, mtu);
                /* update rt6_ex->stamp for cache */
                if (rt6->rt6i_flags & RTF_CACHE)
                        rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                /* clone into an exception entry so the parent route
                 * keeps its original MTU
                 */
                struct rt6_info *nrt6;

                nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
                        /* non-zero return means insertion failed;
                         * drop the clone immediately
                         */
                        if (rt6_insert_exception(nrt6, rt6))
                                dst_release_immediate(&nrt6->dst);
                }
        }
}
2155
2156 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2157                                struct sk_buff *skb, u32 mtu)
2158 {
2159         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2160 }
2161
2162 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2163                      int oif, u32 mark, kuid_t uid)
2164 {
2165         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2166         struct dst_entry *dst;
2167         struct flowi6 fl6;
2168
2169         memset(&fl6, 0, sizeof(fl6));
2170         fl6.flowi6_oif = oif;
2171         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2172         fl6.daddr = iph->daddr;
2173         fl6.saddr = iph->saddr;
2174         fl6.flowlabel = ip6_flowinfo(iph);
2175         fl6.flowi6_uid = uid;
2176
2177         dst = ip6_route_output(net, NULL, &fl6);
2178         if (!dst->error)
2179                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2180         dst_release(dst);
2181 }
2182 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2183
/* ip6_sk_update_pmtu - socket-scoped PMTU update.
 * Applies the new MTU using the socket's bound device, mark and uid,
 * then refreshes the socket's cached dst if it has become invalid.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
        struct dst_entry *dst;

        ip6_update_pmtu(skb, sock_net(sk), mtu,
                        sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

        /* nothing more to do if the cached dst is still valid */
        dst = __sk_dst_get(sk);
        if (!dst || !dst->obsolete ||
            dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
                return;

        /* only re-route here when the socket is not owned by user
         * context; v4-mapped destinations are not handled by the
         * IPv6 datagram rerouting path
         */
        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                ip6_datagram_dst_update(sk, false);
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2202
/* Handle redirects: flowi6 extended with the redirecting router's
 * address, smuggled through fib6_rule_lookup() by embedding the flowi6
 * as the first member (see __ip6_route_redirect's container cast).
 */
struct ip6rd_flowi {
        struct flowi6 fl6;              /* flow the redirect applies to */
        struct in6_addr gateway;        /* router that sent the redirect */
};
2208
2209 static struct rt6_info *__ip6_route_redirect(struct net *net,
2210                                              struct fib6_table *table,
2211                                              struct flowi6 *fl6,
2212                                              const struct sk_buff *skb,
2213                                              int flags)
2214 {
2215         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2216         struct rt6_info *rt, *rt_cache;
2217         struct fib6_node *fn;
2218
2219         /* Get the "current" route for this destination and
2220          * check if the redirect has come from appropriate router.
2221          *
2222          * RFC 4861 specifies that redirects should only be
2223          * accepted if they come from the nexthop to the target.
2224          * Due to the way the routes are chosen, this notion
2225          * is a bit fuzzy and one might need to check all possible
2226          * routes.
2227          */
2228
2229         rcu_read_lock();
2230         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2231 restart:
2232         for_each_fib6_node_rt_rcu(fn) {
2233                 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2234                         continue;
2235                 if (rt6_check_expired(rt))
2236                         continue;
2237                 if (rt->dst.error)
2238                         break;
2239                 if (!(rt->rt6i_flags & RTF_GATEWAY))
2240                         continue;
2241                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2242                         continue;
2243                 /* rt_cache's gateway might be different from its 'parent'
2244                  * in the case of an ip redirect.
2245                  * So we keep searching in the exception table if the gateway
2246                  * is different.
2247                  */
2248                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2249                         rt_cache = rt6_find_cached_rt(rt,
2250                                                       &fl6->daddr,
2251                                                       &fl6->saddr);
2252                         if (rt_cache &&
2253                             ipv6_addr_equal(&rdfl->gateway,
2254                                             &rt_cache->rt6i_gateway)) {
2255                                 rt = rt_cache;
2256                                 break;
2257                         }
2258                         continue;
2259                 }
2260                 break;
2261         }
2262
2263         if (!rt)
2264                 rt = net->ipv6.ip6_null_entry;
2265         else if (rt->dst.error) {
2266                 rt = net->ipv6.ip6_null_entry;
2267                 goto out;
2268         }
2269
2270         if (rt == net->ipv6.ip6_null_entry) {
2271                 fn = fib6_backtrack(fn, &fl6->saddr);
2272                 if (fn)
2273                         goto restart;
2274         }
2275
2276 out:
2277         ip6_hold_safe(net, &rt, true);
2278
2279         rcu_read_unlock();
2280
2281         trace_fib6_table_lookup(net, rt, table, fl6);
2282         return rt;
2283 };
2284
2285 static struct dst_entry *ip6_route_redirect(struct net *net,
2286                                             const struct flowi6 *fl6,
2287                                             const struct sk_buff *skb,
2288                                             const struct in6_addr *gateway)
2289 {
2290         int flags = RT6_LOOKUP_F_HAS_SADDR;
2291         struct ip6rd_flowi rdfl;
2292
2293         rdfl.fl6 = *fl6;
2294         rdfl.gateway = *gateway;
2295
2296         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2297                                 flags, __ip6_route_redirect);
2298 }
2299
2300 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2301                   kuid_t uid)
2302 {
2303         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2304         struct dst_entry *dst;
2305         struct flowi6 fl6;
2306
2307         memset(&fl6, 0, sizeof(fl6));
2308         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2309         fl6.flowi6_oif = oif;
2310         fl6.flowi6_mark = mark;
2311         fl6.daddr = iph->daddr;
2312         fl6.saddr = iph->saddr;
2313         fl6.flowlabel = ip6_flowinfo(iph);
2314         fl6.flowi6_uid = uid;
2315
2316         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2317         rt6_do_redirect(dst, NULL, skb);
2318         dst_release(dst);
2319 }
2320 EXPORT_SYMBOL_GPL(ip6_redirect);
2321
2322 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2323                             u32 mark)
2324 {
2325         const struct ipv6hdr *iph = ipv6_hdr(skb);
2326         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2327         struct dst_entry *dst;
2328         struct flowi6 fl6;
2329
2330         memset(&fl6, 0, sizeof(fl6));
2331         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2332         fl6.flowi6_oif = oif;
2333         fl6.flowi6_mark = mark;
2334         fl6.daddr = msg->dest;
2335         fl6.saddr = iph->daddr;
2336         fl6.flowi6_uid = sock_net_uid(net, NULL);
2337
2338         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2339         rt6_do_redirect(dst, NULL, skb);
2340         dst_release(dst);
2341 }
2342
2343 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2344 {
2345         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2346                      sk->sk_uid);
2347 }
2348 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2349
2350 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2351 {
2352         struct net_device *dev = dst->dev;
2353         unsigned int mtu = dst_mtu(dst);
2354         struct net *net = dev_net(dev);
2355
2356         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2357
2358         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2359                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2360
2361         /*
2362          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2363          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2364          * IPV6_MAXPLEN is also valid and means: "any MSS,
2365          * rely only on pmtu discovery"
2366          */
2367         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2368                 mtu = IPV6_MAXPLEN;
2369         return mtu;
2370 }
2371
2372 static unsigned int ip6_mtu(const struct dst_entry *dst)
2373 {
2374         const struct rt6_info *rt = (const struct rt6_info *)dst;
2375         unsigned int mtu = rt->rt6i_pmtu;
2376         struct inet6_dev *idev;
2377
2378         if (mtu)
2379                 goto out;
2380
2381         mtu = dst_metric_raw(dst, RTAX_MTU);
2382         if (mtu)
2383                 goto out;
2384
2385         mtu = IPV6_MIN_MTU;
2386
2387         rcu_read_lock();
2388         idev = __in6_dev_get(dst->dev);
2389         if (idev)
2390                 mtu = idev->cnf.mtu6;
2391         rcu_read_unlock();
2392
2393 out:
2394         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2395
2396         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2397 }
2398
/* icmp6_dst_alloc - build a one-off dst for sending an ICMPv6 packet.
 * The dst is a host route to fl6->daddr on @dev, kept on the uncached
 * list (not in the fib tree), and passed through xfrm_lookup().
 * Returns the dst or an ERR_PTR (-ENODEV / -ENOMEM).
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct flowi6 *fl6)
{
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);

        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);

        rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                /* drop the idev reference taken above before bailing out */
                in6_dev_put(idev);
                dst = ERR_PTR(-ENOMEM);
                goto out;
        }

        rt->dst.flags |= DST_HOST;
        rt->dst.input = ip6_input;
        rt->dst.output  = ip6_output;
        /* direct host route: the gateway is the destination itself */
        rt->rt6i_gateway  = fl6->daddr;
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev     = idev;
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

        /* Add this dst into uncached_list so that rt6_disable_ip() can
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
        return dst;
}
2437
2438 static int ip6_dst_gc(struct dst_ops *ops)
2439 {
2440         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2441         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2442         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2443         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2444         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2445         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2446         int entries;
2447
2448         entries = dst_entries_get_fast(ops);
2449         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2450             entries <= rt_max_size)
2451                 goto out;
2452
2453         net->ipv6.ip6_rt_gc_expire++;
2454         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2455         entries = dst_entries_get_slow(ops);
2456         if (entries < ops->gc_thresh)
2457                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2458 out:
2459         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2460         return entries > rt_max_size;
2461 }
2462
2463 static int ip6_convert_metrics(struct mx6_config *mxc,
2464                                const struct fib6_config *cfg)
2465 {
2466         struct net *net = cfg->fc_nlinfo.nl_net;
2467         bool ecn_ca = false;
2468         struct nlattr *nla;
2469         int remaining;
2470         u32 *mp;
2471
2472         if (!cfg->fc_mx)
2473                 return 0;
2474
2475         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2476         if (unlikely(!mp))
2477                 return -ENOMEM;
2478
2479         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2480                 int type = nla_type(nla);
2481                 u32 val;
2482
2483                 if (!type)
2484                         continue;
2485                 if (unlikely(type > RTAX_MAX))
2486                         goto err;
2487
2488                 if (type == RTAX_CC_ALGO) {
2489                         char tmp[TCP_CA_NAME_MAX];
2490
2491                         nla_strlcpy(tmp, nla, sizeof(tmp));
2492                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2493                         if (val == TCP_CA_UNSPEC)
2494                                 goto err;
2495                 } else {
2496                         val = nla_get_u32(nla);
2497                 }
2498                 if (type == RTAX_HOPLIMIT && val > 255)
2499                         val = 255;
2500                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2501                         goto err;
2502
2503                 mp[type - 1] = val;
2504                 __set_bit(type - 1, mxc->mx_valid);
2505         }
2506
2507         if (ecn_ca) {
2508                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2509                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2510         }
2511
2512         mxc->mx = mp;
2513         return 0;
2514  err:
2515         kfree(mp);
2516         return -EINVAL;
2517 }
2518
2519 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2520                                             struct fib6_config *cfg,
2521                                             const struct in6_addr *gw_addr,
2522                                             u32 tbid, int flags)
2523 {
2524         struct flowi6 fl6 = {
2525                 .flowi6_oif = cfg->fc_ifindex,
2526                 .daddr = *gw_addr,
2527                 .saddr = cfg->fc_prefsrc,
2528         };
2529         struct fib6_table *table;
2530         struct rt6_info *rt;
2531
2532         table = fib6_get_table(net, tbid);
2533         if (!table)
2534                 return NULL;
2535
2536         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2537                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2538
2539         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2540         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2541
2542         /* if table lookup failed, fall back to full lookup */
2543         if (rt == net->ipv6.ip6_null_entry) {
2544                 ip6_rt_put(rt);
2545                 rt = NULL;
2546         }
2547
2548         return rt;
2549 }
2550
2551 static int ip6_route_check_nh_onlink(struct net *net,
2552                                      struct fib6_config *cfg,
2553                                      struct net_device *dev,
2554                                      struct netlink_ext_ack *extack)
2555 {
2556         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2557         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2558         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2559         struct rt6_info *grt;
2560         int err;
2561
2562         err = 0;
2563         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2564         if (grt) {
2565                 if (!grt->dst.error &&
2566                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2567                         NL_SET_ERR_MSG(extack,
2568                                        "Nexthop has invalid gateway or device mismatch");
2569                         err = -EINVAL;
2570                 }
2571
2572                 ip6_rt_put(grt);
2573         }
2574
2575         return err;
2576 }
2577
/* Validate a (non-onlink) gateway nexthop and, when no device was given,
 * discover it from the route that resolves the gateway.
 *
 * @_dev/@idev: in/out. If *_dev is set, the gateway must resolve through
 * that device. If not, the resolving route's device and inet6_dev are
 * returned with a reference held on each.
 *
 * Returns 0 when the gateway resolves to a directly-connected route,
 * -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		/* Try the table the new route targets first. */
		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* A recursive gateway or a device mismatch makes this
			 * match unusable; drop it and fall back below.
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* Fall back to a full routing lookup across tables. */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* No device specified: adopt the one the lookup found.
		 * Hold refs for the caller; released by the caller's cleanup.
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	/* Only a non-gateway (directly connected) result is a valid nexthop. */
	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2628
/* Build (but do not insert) an rt6_info from a fib6_config.
 *
 * Validates the config, resolves the output device/idev (taking references
 * on both), selects/creates the FIB table and fills in the new route.
 * On success the returned route holds the dev/idev references; on failure
 * everything acquired here is released and an ERR_PTR is returned.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	/* IPv6 prefixes cannot exceed 128 bits. */
	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	/* Source-based routing needs subtree support compiled in. */
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	/* Default route priority when userspace left it unset. */
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* Onlink routes require an explicit, up device. */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	/* Without NLM_F_CREATE the table should already exist; warn but
	 * still create it for backward compatibility.
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	/* Addrconf-generated routes stay out of dst accounting. */
	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Pick the input handler by destination class. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* Lightweight tunnel encapsulation, if requested. */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;	/* host route */

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;
	rt->rt6i_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* Map the reject type to its error code and packet handlers. */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
				goto out;
			}

			/* Validate the gateway; may also resolve dev/idev. */
			if (cfg->fc_flags & RTNH_F_ONLINK) {
				err = ip6_route_check_nh_onlink(net, cfg, dev,
								extack);
			} else {
				err = ip6_route_check_nh(net, cfg, &dev, &idev);
			}
			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Egress device not specified");
			goto out;
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
			goto out;
		}
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* A preferred source address must actually be on the device. */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* Record carrier state so multipath can skip dead nexthops. */
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* Route takes ownership of the dev/idev references held above. */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	/* Error path: drop everything acquired in this function. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
2913
2914 int ip6_route_add(struct fib6_config *cfg,
2915                   struct netlink_ext_ack *extack)
2916 {
2917         struct mx6_config mxc = { .mx = NULL, };
2918         struct rt6_info *rt;
2919         int err;
2920
2921         rt = ip6_route_info_create(cfg, extack);
2922         if (IS_ERR(rt)) {
2923                 err = PTR_ERR(rt);
2924                 rt = NULL;
2925                 goto out;
2926         }
2927
2928         err = ip6_convert_metrics(&mxc, cfg);
2929         if (err)
2930                 goto out;
2931
2932         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2933
2934         kfree(mxc.mx);
2935
2936         return err;
2937 out:
2938         if (rt)
2939                 dst_release_immediate(&rt->dst);
2940
2941         return err;
2942 }
2943
2944 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2945 {
2946         int err;
2947         struct fib6_table *table;
2948         struct net *net = dev_net(rt->dst.dev);
2949
2950         if (rt == net->ipv6.ip6_null_entry) {
2951                 err = -ENOENT;
2952                 goto out;
2953         }
2954
2955         table = rt->rt6i_table;
2956         spin_lock_bh(&table->tb6_lock);
2957         err = fib6_del(rt, info);
2958         spin_unlock_bh(&table->tb6_lock);
2959
2960 out:
2961         ip6_rt_put(rt);
2962         return err;
2963 }
2964
2965 int ip6_del_rt(struct rt6_info *rt)
2966 {
2967         struct nl_info info = {
2968                 .nl_net = dev_net(rt->dst.dev),
2969         };
2970         return __ip6_del_rt(rt, &info);
2971 }
2972
/* Delete a multipath route: @rt plus (when fc_delete_all_nh is set) all of
 * its siblings, under one table lock. Tries to emit a single RTM_DELROUTE
 * notification covering every hop; consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				/* Fill failed: fall back to per-hop notify. */
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe variant: fib6_del unlinks each entry as we go. */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* Send the combined notification outside the table lock. */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3024
/* Find and delete the route(s) matching @cfg.
 * Walks the destination node's route list under RCU; on the first full
 * match it takes a reference, drops RCU and hands off to __ip6_del_rt()
 * (gateway-specific single hop) or __ip6_del_rt_siblings().
 * Returns 0 on success, -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* For RTF_CACHE we locate the covering node (exact_match = false)
	 * since cached clones hang off the exception table of a parent.
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				/* Swap in the cached clone, if any. */
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			/* Each specified selector must match. */
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* Route is being freed concurrently: stop. */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3081
/* Handle a received NDISC Redirect for @dst.
 * Validates the redirect message and its ND options, updates the neighbour
 * cache for the new first hop, and installs a cached clone of @dst pointing
 * at the new gateway into the exception table.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Length of ND options trailing the fixed redirect header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast router.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers and interfaces configured to ignore redirects drop here. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* Extract the target link-layer address option, if present. */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* Create (or find) the neighbour entry for the new first hop. */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* Clone the route and repoint it at the redirect target. */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* Tell interested parties (e.g. XFRM, IB) about the path change. */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3199
3200 /*
3201  *      Misc support functions
3202  */
3203
/* Link cached clone @rt to its parent @from: take a reference on the
 * parent and share its metrics (read-only). @from must not itself be a
 * clone (BUG if it already has a parent).
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->from);

	/* Expiry is tracked on the parent, not per-clone. */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->from = from;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3213
/* Initialize @rt as a copy of @ort, taking references on the idev,
 * the parent route (via rt6_set_from) and the lwtunnel state.
 * Note: rt6i_flags must be copied before rt6_set_from(), which clears
 * RTF_EXPIRES on the clone.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3235
3236 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA-learned (RTF_ROUTEINFO) route for @prefix/@prefixlen with
 * gateway @gwaddr on @dev. Returns the route with a reference held, or
 * NULL if not found. Note: for_each_fib6_node_rt_rcu() iterates @rt over
 * the node's route list and leaves it NULL when nothing matched.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* exact_match = true: only the node for this exact prefix. */
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* Match: take a reference before leaving RCU. */
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3271
3272 static struct rt6_info *rt6_add_route_info(struct net *net,
3273                                            const struct in6_addr *prefix, int prefixlen,
3274                                            const struct in6_addr *gwaddr,
3275                                            struct net_device *dev,
3276                                            unsigned int pref)
3277 {
3278         struct fib6_config cfg = {
3279                 .fc_metric      = IP6_RT_PRIO_USER,
3280                 .fc_ifindex     = dev->ifindex,
3281                 .fc_dst_len     = prefixlen,
3282                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3283                                   RTF_UP | RTF_PREF(pref),
3284                 .fc_protocol = RTPROT_RA,
3285                 .fc_nlinfo.portid = 0,
3286                 .fc_nlinfo.nlh = NULL,
3287                 .fc_nlinfo.nl_net = net,
3288         };
3289
3290         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3291         cfg.fc_dst = *prefix;
3292         cfg.fc_gateway = *gwaddr;
3293
3294         /* We should treat it as a default route if prefix length is 0. */
3295         if (!prefixlen)
3296                 cfg.fc_flags |= RTF_DEFAULT;
3297
3298         ip6_route_add(&cfg, NULL);
3299
3300         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3301 }
3302 #endif
3303
/* Find the RA-learned (RTF_ADDRCONF|RTF_DEFAULT) default route via @addr
 * on @dev. Returns it with a reference held, or NULL. The iterator macro
 * leaves @rt NULL when the whole list was scanned without a break.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* Default routes live directly at the table root. */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3326
/* Install an RA-learned default route via @gwaddr on @dev with router
 * preference @pref, then return a held reference to it via
 * rt6_get_dflt_router() (NULL if neither insert nor lookup succeeded).
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
                                     struct net_device *dev,
                                     unsigned int pref)
{
        struct fib6_config cfg = {
                .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
                .fc_metric      = IP6_RT_PRIO_USER,
                .fc_ifindex     = dev->ifindex,
                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
                .fc_protocol = RTPROT_RA,
                .fc_nlinfo.portid = 0,
                .fc_nlinfo.nlh = NULL,
                .fc_nlinfo.nl_net = dev_net(dev),
        };

        cfg.fc_gateway = *gwaddr;

        if (!ip6_route_add(&cfg, NULL)) {
                struct fib6_table *table;

                /* flag the table so rt6_purge_dflt_routers() only has
                 * to scan tables that actually hold a default router
                 */
                table = fib6_get_table(dev_net(dev), cfg.fc_table);
                if (table)
                        table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
        }

        return rt6_get_dflt_router(gwaddr, dev);
}
3355
/* Remove all RA-learned (RTF_DEFAULT/RTF_ADDRCONF) routes from @table,
 * except on interfaces configured to keep them (accept_ra == 2).
 *
 * The RCU read lock is dropped before each ip6_del_rt() call, so the
 * walk restarts from the top of the root node after every deletion
 * attempt (including when the reference grab fails).
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
        struct rt6_info *rt;

restart:
        rcu_read_lock();
        for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
                    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
                        /* delete only if we managed to take a reference */
                        if (dst_hold_safe(&rt->dst)) {
                                rcu_read_unlock();
                                ip6_del_rt(rt);
                        } else {
                                rcu_read_unlock();
                        }
                        goto restart;
                }
        }
        rcu_read_unlock();

        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3378
/* Purge RA-learned default routes from every FIB table in @net that is
 * flagged as holding one (RT6_TABLE_HAS_DFLT_ROUTER).
 */
void rt6_purge_dflt_routers(struct net *net)
{
        struct fib6_table *table;
        struct hlist_head *head;
        unsigned int h;

        rcu_read_lock();

        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                head = &net->ipv6.fib_table_hash[h];
                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
                        if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
                                __rt6_purge_dflt_routers(table);
                }
        }

        rcu_read_unlock();
}
3397
3398 static void rtmsg_to_fib6_config(struct net *net,
3399                                  struct in6_rtmsg *rtmsg,
3400                                  struct fib6_config *cfg)
3401 {
3402         memset(cfg, 0, sizeof(*cfg));
3403
3404         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3405                          : RT6_TABLE_MAIN;
3406         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3407         cfg->fc_metric = rtmsg->rtmsg_metric;
3408         cfg->fc_expires = rtmsg->rtmsg_info;
3409         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3410         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3411         cfg->fc_flags = rtmsg->rtmsg_flags;
3412
3413         cfg->fc_nlinfo.nl_net = net;
3414
3415         cfg->fc_dst = rtmsg->rtmsg_dst;
3416         cfg->fc_src = rtmsg->rtmsg_src;
3417         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3418 }
3419
3420 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3421 {
3422         struct fib6_config cfg;
3423         struct in6_rtmsg rtmsg;
3424         int err;
3425
3426         switch (cmd) {
3427         case SIOCADDRT:         /* Add a route */
3428         case SIOCDELRT:         /* Delete a route */
3429                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3430                         return -EPERM;
3431                 err = copy_from_user(&rtmsg, arg,
3432                                      sizeof(struct in6_rtmsg));
3433                 if (err)
3434                         return -EFAULT;
3435
3436                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3437
3438                 rtnl_lock();
3439                 switch (cmd) {
3440                 case SIOCADDRT:
3441                         err = ip6_route_add(&cfg, NULL);
3442                         break;
3443                 case SIOCDELRT:
3444                         err = ip6_route_del(&cfg, NULL);
3445                         break;
3446                 default:
3447                         err = -EINVAL;
3448                 }
3449                 rtnl_unlock();
3450
3451                 return err;
3452         }
3453
3454         return -EINVAL;
3455 }
3456
3457 /*
3458  *      Drop the packet on the floor
3459  */
3460
3461 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3462 {
3463         int type;
3464         struct dst_entry *dst = skb_dst(skb);
3465         switch (ipstats_mib_noroutes) {
3466         case IPSTATS_MIB_INNOROUTES:
3467                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3468                 if (type == IPV6_ADDR_ANY) {
3469                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3470                                       IPSTATS_MIB_INADDRERRORS);
3471                         break;
3472                 }
3473                 /* FALLTHROUGH */
3474         case IPSTATS_MIB_OUTNOROUTES:
3475                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3476                               ipstats_mib_noroutes);
3477                 break;
3478         }
3479         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3480         kfree_skb(skb);
3481         return 0;
3482 }
3483
/* dst input handler for blackhole routes: drop with no-route accounting */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3488
/* dst output handler for blackhole routes: point skb->dev at the route's
 * device, then drop with no-route accounting
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3494
/* dst input handler for prohibit routes: drop with an administratively
 * prohibited ICMP error
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3499
/* dst output handler for prohibit routes: point skb->dev at the route's
 * device, then drop with an administratively prohibited ICMP error
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3505
3506 /*
3507  *      Allocate a dst for local (unicast / anycast) address.
3508  */
3509
3510 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3511                                     const struct in6_addr *addr,
3512                                     bool anycast)
3513 {
3514         u32 tb_id;
3515         struct net *net = dev_net(idev->dev);
3516         struct net_device *dev = idev->dev;
3517         struct rt6_info *rt;
3518
3519         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3520         if (!rt)
3521                 return ERR_PTR(-ENOMEM);
3522
3523         in6_dev_hold(idev);
3524
3525         rt->dst.flags |= DST_HOST;
3526         rt->dst.input = ip6_input;
3527         rt->dst.output = ip6_output;
3528         rt->rt6i_idev = idev;
3529
3530         rt->rt6i_protocol = RTPROT_KERNEL;
3531         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3532         if (anycast)
3533                 rt->rt6i_flags |= RTF_ANYCAST;
3534         else
3535                 rt->rt6i_flags |= RTF_LOCAL;
3536
3537         rt->rt6i_gateway  = *addr;
3538         rt->rt6i_dst.addr = *addr;
3539         rt->rt6i_dst.plen = 128;
3540         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3541         rt->rt6i_table = fib6_get_table(net, tb_id);
3542
3543         return rt;
3544 }
3545
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
        struct net_device *dev; /* restrict to this device; NULL matches all */
        struct net *net;        /* namespace whose tables are walked */
        struct in6_addr *addr;  /* preferred source address being removed */
};
3552
3553 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3554 {
3555         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3556         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3557         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3558
3559         if (((void *)rt->dst.dev == dev || !dev) &&
3560             rt != net->ipv6.ip6_null_entry &&
3561             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3562                 spin_lock_bh(&rt6_exception_lock);
3563                 /* remove prefsrc entry */
3564                 rt->rt6i_prefsrc.plen = 0;
3565                 /* need to update cache as well */
3566                 rt6_exceptions_remove_prefsrc(rt);
3567                 spin_unlock_bh(&rt6_exception_lock);
3568         }
3569         return 0;
3570 }
3571
/* Called when address @ifp is deleted: walk all FIB tables in its
 * namespace and clear it from any route's preferred-source entry.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
        struct net *net = dev_net(ifp->idev->dev);
        struct arg_dev_net_ip adni = {
                .dev = ifp->idev->dev,
                .net = net,
                .addr = &ifp->addr,
        };
        fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
3582
3583 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3584
3585 /* Remove routers and update dst entries when gateway turn into host. */
3586 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3587 {
3588         struct in6_addr *gateway = (struct in6_addr *)arg;
3589
3590         if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3591             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3592                 return -1;
3593         }
3594
3595         /* Further clean up cached routes in exception table.
3596          * This is needed because cached route may have a different
3597          * gateway than its 'parent' in the case of an ip redirect.
3598          */
3599         rt6_exceptions_clean_tohost(rt, gateway);
3600
3601         return 0;
3602 }
3603
/* Remove RA router routes via @gateway from all tables in @net; used
 * when the gateway turns into a host (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3608
/* Argument block for the fib6_ifup()/fib6_ifdown() walkers; which union
 * member is valid depends on the callback (nh_flags for rt6_sync_up(),
 * the netdev notifier event for rt6_sync_down_dev()).
 */
struct arg_netdev_event {
        const struct net_device *dev;
        union {
                unsigned int nh_flags;
                unsigned long event;
        };
};
3616
/* Find the first route in @rt's fib6 node leaf list that belongs to the
 * same ECMP group (same metric and qualifies for ECMP), or NULL.
 * Must be called with the table lock held (checked via lockdep on
 * tb6_lock by every dereference below).
 */
static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
{
        struct rt6_info *iter;
        struct fib6_node *fn;

        fn = rcu_dereference_protected(rt->rt6i_node,
                        lockdep_is_held(&rt->rt6i_table->tb6_lock));
        iter = rcu_dereference_protected(fn->leaf,
                        lockdep_is_held(&rt->rt6i_table->tb6_lock));
        while (iter) {
                if (iter->rt6i_metric == rt->rt6i_metric &&
                    rt6_qualify_for_ecmp(iter))
                        return iter;
                iter = rcu_dereference_protected(iter->rt6_next,
                                lockdep_is_held(&rt->rt6i_table->tb6_lock));
        }

        return NULL;
}
3636
3637 static bool rt6_is_dead(const struct rt6_info *rt)
3638 {
3639         if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3640             (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3641              rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3642                 return true;
3643
3644         return false;
3645 }
3646
3647 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3648 {
3649         struct rt6_info *iter;
3650         int total = 0;
3651
3652         if (!rt6_is_dead(rt))
3653                 total += rt->rt6i_nh_weight;
3654
3655         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3656                 if (!rt6_is_dead(iter))
3657                         total += iter->rt6i_nh_weight;
3658         }
3659
3660         return total;
3661 }
3662
/* Compute and atomically publish the hash upper bound for one nexthop.
 *
 * @weight accumulates the weights of live nexthops seen so far; the
 * bound maps that running fraction of @total onto the 31-bit hash
 * space.  A dead nexthop keeps a bound of -1 so it is never selected.
 */
static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
{
        int upper_bound = -1;

        if (!rt6_is_dead(rt)) {
                *weight += rt->rt6i_nh_weight;
                upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
                                                    total) - 1;
        }
        atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
}
3674
/* Assign hash upper bounds to @rt and each of its siblings in order, so
 * the bounds are cumulative over the group's total weight.
 */
static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
{
        struct rt6_info *iter;
        int weight = 0;

        rt6_upper_bound_set(rt, &weight, total);

        list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
                rt6_upper_bound_set(iter, &weight, total);
}
3685
/* Recompute the multipath hash upper bounds for @rt's ECMP group after
 * a nexthop changed state (dead / linkdown / back up).
 */
void rt6_multipath_rebalance(struct rt6_info *rt)
{
        struct rt6_info *first;
        int total;

        /* In case the entire multipath route was marked for flushing,
         * then there is no need to rebalance upon the removal of every
         * sibling route.
         */
        if (!rt->rt6i_nsiblings || rt->should_flush)
                return;

        /* During lookup routes are evaluated in order, so we need to
         * make sure upper bounds are assigned from the first sibling
         * onwards.
         */
        first = rt6_multipath_first_sibling(rt);
        if (WARN_ON_ONCE(!first))
                return;

        total = rt6_multipath_total_weight(first);
        rt6_multipath_upper_bound_set(first, total);
}
3709
3710 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3711 {
3712         const struct arg_netdev_event *arg = p_arg;
3713         const struct net *net = dev_net(arg->dev);
3714
3715         if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3716                 rt->rt6i_nh_flags &= ~arg->nh_flags;
3717                 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3718                 rt6_multipath_rebalance(rt);
3719         }
3720
3721         return 0;
3722 }
3723
/* Clear @nh_flags on all nexthops using @dev (the device came up).
 *
 * When RTNH_F_DEAD is being cleared and the device has carrier,
 * RTNH_F_LINKDOWN is cleared as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
        struct arg_netdev_event arg = {
                .dev = dev,
                {
                        .nh_flags = nh_flags,
                },
        };

        if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
                arg.nh_flags |= RTNH_F_LINKDOWN;

        fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3738
3739 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3740                                    const struct net_device *dev)
3741 {
3742         struct rt6_info *iter;
3743
3744         if (rt->dst.dev == dev)
3745                 return true;
3746         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3747                 if (iter->dst.dev == dev)
3748                         return true;
3749
3750         return false;
3751 }
3752
3753 static void rt6_multipath_flush(struct rt6_info *rt)
3754 {
3755         struct rt6_info *iter;
3756
3757         rt->should_flush = 1;
3758         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3759                 iter->should_flush = 1;
3760 }
3761
3762 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3763                                              const struct net_device *down_dev)
3764 {
3765         struct rt6_info *iter;
3766         unsigned int dead = 0;
3767
3768         if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3769                 dead++;
3770         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3771                 if (iter->dst.dev == down_dev ||
3772                     iter->rt6i_nh_flags & RTNH_F_DEAD)
3773                         dead++;
3774
3775         return dead;
3776 }
3777
3778 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3779                                        const struct net_device *dev,
3780                                        unsigned int nh_flags)
3781 {
3782         struct rt6_info *iter;
3783
3784         if (rt->dst.dev == dev)
3785                 rt->rt6i_nh_flags |= nh_flags;
3786         list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3787                 if (iter->dst.dev == dev)
3788                         iter->rt6i_nh_flags |= nh_flags;
3789 }
3790
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for rt6_sync_down_dev(): decide each
 * route's fate when @dev goes down, unregisters or loses carrier.
 *
 * A negative return asks the fib walker to remove the route, 0 keeps
 * it.  NOTE(review): the distinct -1 vs -2 values are interpreted by
 * the walker - confirm their exact semantics in fib6_clean_node().
 */
static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
{
        const struct arg_netdev_event *arg = p_arg;
        const struct net_device *dev = arg->dev;
        const struct net *net = dev_net(dev);

        /* never touch the null entry */
        if (rt == net->ipv6.ip6_null_entry)
                return 0;

        switch (arg->event) {
        case NETDEV_UNREGISTER:
                /* device is going away: remove every route through it */
                return rt->dst.dev == dev ? -1 : 0;
        case NETDEV_DOWN:
                if (rt->should_flush)
                        return -1;
                /* non-multipath route: remove iff it uses this device */
                if (!rt->rt6i_nsiblings)
                        return rt->dst.dev == dev ? -1 : 0;
                if (rt6_multipath_uses_dev(rt, dev)) {
                        unsigned int count;

                        count = rt6_multipath_dead_count(rt, dev);
                        /* every nexthop dead: flush the whole group */
                        if (rt->rt6i_nsiblings + 1 == count) {
                                rt6_multipath_flush(rt);
                                return -1;
                        }
                        /* otherwise only disable the affected nexthops
                         * and rebalance the hash bounds
                         */
                        rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
                                                   RTNH_F_LINKDOWN);
                        fib6_update_sernum(rt);
                        rt6_multipath_rebalance(rt);
                }
                return -2;
        case NETDEV_CHANGE:
                /* carrier change: mark the nexthop linkdown, but leave
                 * local/anycast routes alone
                 */
                if (rt->dst.dev != dev ||
                    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
                        break;
                rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
                rt6_multipath_rebalance(rt);
                break;
        }

        return 0;
}
3834
/* Walk all FIB tables and remove or degrade routes through @dev in
 * response to netdev notifier @event (see fib6_ifdown() for the
 * per-event handling).
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
        struct arg_netdev_event arg = {
                .dev = dev,
                {
                        .event = event,
                },
        };

        fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}
3846
/* Full IPv6 routing teardown for @dev: drop its routes, flush the
 * uncached dst list and remove its neighbour-discovery entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
        rt6_sync_down_dev(dev, event);
        rt6_uncached_list_flush_dev(dev_net(dev), dev);
        neigh_ifdown(&nd_tbl, dev);
}
3853
/* Argument block for rt6_mtu_change_route() */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* the new MTU */
};
3858
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device
 * MTU change into the route's RTAX_MTU metric and its cached exception
 * routes.  Always returns 0 (never deletes a route).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
        struct inet6_dev *idev;

        /* In IPv6 pmtu discovery is not optional,
           so that RTAX_MTU lock cannot disable it.
           We still use this lock to block changes
           caused by addrconf/ndisc.
        */

        idev = __in6_dev_get(arg->dev);
        if (!idev)
                return 0;

        /* For administrative MTU increase, there is no way to discover
           IPv6 PMTU increase, so PMTU increase should be updated here.
           Since RFC 1981 doesn't include administrative MTU increase
           update PMTU increase is a MUST. (i.e. jumbo frame)
         */
        /*
           If new MTU is less than route PMTU, this new MTU will be the
           lowest MTU in the path, update the route PMTU to reflect PMTU
           decreases; if new MTU is greater than route PMTU, and the
           old MTU is the lowest MTU in the path, update the route PMTU
           to reflect the increase. In this case if the other nodes' MTU
           also have the lowest MTU, TOO BIG MESSAGE will be lead to
           PMTU discovery.
         */
        if (rt->dst.dev == arg->dev &&
            dst_metric_raw(&rt->dst, RTAX_MTU) &&
            !dst_metric_locked(&rt->dst, RTAX_MTU)) {
                spin_lock_bh(&rt6_exception_lock);
                if (dst_mtu(&rt->dst) >= arg->mtu ||
                    (dst_mtu(&rt->dst) < arg->mtu &&
                     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
                        dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
                }
                /* cached exception routes get the new PMTU as well */
                rt6_exceptions_update_pmtu(rt, arg->mtu);
                spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
}
3902
/* Propagate a device MTU change to all routes using @dev. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
        struct rt6_mtu_change_arg arg = {
                .dev = dev,
                .mtu = mtu,
        };

        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
3912
/* Netlink attribute policy for RTM_NEWROUTE/RTM_DELROUTE/RTM_GETROUTE */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]             = { .type = NLA_NESTED },
        [RTA_EXPIRES]           = { .type = NLA_U32 },
        [RTA_UID]               = { .type = NLA_U32 },
        [RTA_MARK]              = { .type = NLA_U32 },
};
3927
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config.
 *
 * Returns 0 on success or a negative errno (parse failure, short
 * address attribute, invalid lwtunnel encap type).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg,
                              struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          NULL);
        if (err < 0)
                goto errout;

        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        memset(cfg, 0, sizeof(*cfg));

        cfg->fc_table = rtm->rtm_table;
        cfg->fc_dst_len = rtm->rtm_dst_len;
        cfg->fc_src_len = rtm->rtm_src_len;
        cfg->fc_flags = RTF_UP;
        cfg->fc_protocol = rtm->rtm_protocol;
        cfg->fc_type = rtm->rtm_type;

        /* all reject-style route types map onto RTF_REJECT */
        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        /* only the RTNH_F_ONLINK bit is taken from rtm_flags */
        cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }

        if (tb[RTA_DST]) {
                /* the attribute must carry at least prefix-len bits */
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                int plen = (rtm->rtm_src_len + 7) >> 3;

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        if (tb[RTA_METRICS]) {
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        /* RTA_TABLE, when present, overrides rtm_table */
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        if (tb[RTA_MULTIPATH]) {
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_PREF]) {
                /* unknown preference values fall back to medium */
                pref = nla_get_u8(tb[RTA_PREF]);
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                /* only a finite timeout makes the route expiring */
                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}
4053
/* Per-nexthop bookkeeping used while building a multipath route */
struct rt6_nh {
        struct rt6_info *rt6_info;      /* route built for this nexthop */
        struct fib6_config r_cfg;       /* config the route was built from */
        struct mx6_config mxc;          /* converted metrics for insertion */
        struct list_head next;          /* link in the local rt6_nh_list */
};
4060
4061 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4062 {
4063         struct rt6_nh *nh;
4064
4065         list_for_each_entry(nh, rt6_nh_list, next) {
4066                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4067                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4068                         nh->r_cfg.fc_ifindex);
4069         }
4070 }
4071
4072 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4073                                  struct rt6_info *rt, struct fib6_config *r_cfg)
4074 {
4075         struct rt6_nh *nh;
4076         int err = -EEXIST;
4077
4078         list_for_each_entry(nh, rt6_nh_list, next) {
4079                 /* check if rt6_info already exists */
4080                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4081                         return err;
4082         }
4083
4084         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4085         if (!nh)
4086                 return -ENOMEM;
4087         nh->rt6_info = rt;
4088         err = ip6_convert_metrics(&nh->mxc, r_cfg);
4089         if (err) {
4090                 kfree(nh);
4091                 return err;
4092         }
4093         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4094         list_add_tail(&nh->next, rt6_nh_list);
4095
4096         return 0;
4097 }
4098
/* Send the single RTM_NEWROUTE notification for a completed multipath
 * add/append (per-nexthop notifications were suppressed during insert).
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
                                   struct rt6_info *rt_last,
                                   struct nl_info *info,
                                   __u16 nlflags)
{
        /* if this is an APPEND route, then rt points to the first route
         * inserted and rt_last points to last route inserted. Userspace
         * wants a consistent dump of the route which starts at the first
         * nexthop. Since sibling routes are always added at the end of
         * the list, find the first sibling of the last route appended
         */
        if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
                rt = list_first_entry(&rt_last->rt6i_siblings,
                                      struct rt6_info,
                                      rt6i_siblings);
        }

        if (rt)
                inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
4119
/* Add one route with multiple nexthops (RTA_MULTIPATH) as a unit.
 *
 * Each rtnexthop entry in cfg->fc_mp is expanded into its own rt6_info
 * and queued on a local list before anything touches the FIB, so a
 * parse failure leaves the table untouched.  The queued routes are then
 * inserted one by one with per-route notifications suppressed; a single
 * notification covering the whole multipath route is sent at the end.
 * If an insertion fails midway, the routes inserted so far are deleted
 * again so the netlink notification stream stays coherent.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* per-nexthop config starts as a copy of the route config,
		 * then ifindex/gateway/encap are overridden from the entry
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* rtnh_hops carries (weight - 1) on the wire */
		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): dereferences info->nlh without a NULL check;
		 * this holds for rtnetlink callers — confirm for any
		 * in-kernel caller that builds cfg without an nlh.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any rt6_info not consumed by __ip6_ins_rt() and free
	 * the list bookkeeping
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4252
4253 static int ip6_route_multipath_del(struct fib6_config *cfg,
4254                                    struct netlink_ext_ack *extack)
4255 {
4256         struct fib6_config r_cfg;
4257         struct rtnexthop *rtnh;
4258         int remaining;
4259         int attrlen;
4260         int err = 1, last_err = 0;
4261
4262         remaining = cfg->fc_mp_len;
4263         rtnh = (struct rtnexthop *)cfg->fc_mp;
4264
4265         /* Parse a Multipath Entry */
4266         while (rtnh_ok(rtnh, remaining)) {
4267                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4268                 if (rtnh->rtnh_ifindex)
4269                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4270
4271                 attrlen = rtnh_attrlen(rtnh);
4272                 if (attrlen > 0) {
4273                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4274
4275                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4276                         if (nla) {
4277                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4278                                 r_cfg.fc_flags |= RTF_GATEWAY;
4279                         }
4280                 }
4281                 err = ip6_route_del(&r_cfg, extack);
4282                 if (err)
4283                         last_err = err;
4284
4285                 rtnh = rtnh_next(rtnh, &remaining);
4286         }
4287
4288         return last_err;
4289 }
4290
4291 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4292                               struct netlink_ext_ack *extack)
4293 {
4294         struct fib6_config cfg;
4295         int err;
4296
4297         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4298         if (err < 0)
4299                 return err;
4300
4301         if (cfg.fc_mp)
4302                 return ip6_route_multipath_del(&cfg, extack);
4303         else {
4304                 cfg.fc_delete_all_nh = 1;
4305                 return ip6_route_del(&cfg, extack);
4306         }
4307 }
4308
4309 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4310                               struct netlink_ext_ack *extack)
4311 {
4312         struct fib6_config cfg;
4313         int err;
4314
4315         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4316         if (err < 0)
4317                 return err;
4318
4319         if (cfg.fc_mp)
4320                 return ip6_route_multipath_add(&cfg, extack);
4321         else
4322                 return ip6_route_add(&cfg, extack);
4323 }
4324
4325 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4326 {
4327         int nexthop_len = 0;
4328
4329         if (rt->rt6i_nsiblings) {
4330                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4331                             + NLA_ALIGN(sizeof(struct rtnexthop))
4332                             + nla_total_size(16) /* RTA_GATEWAY */
4333                             + lwtunnel_get_encap_size(rt->dst.lwtstate);
4334
4335                 nexthop_len *= rt->rt6i_nsiblings;
4336         }
4337
4338         return NLMSG_ALIGN(sizeof(struct rtmsg))
4339                + nla_total_size(16) /* RTA_SRC */
4340                + nla_total_size(16) /* RTA_DST */
4341                + nla_total_size(16) /* RTA_GATEWAY */
4342                + nla_total_size(16) /* RTA_PREFSRC */
4343                + nla_total_size(4) /* RTA_TABLE */
4344                + nla_total_size(4) /* RTA_IIF */
4345                + nla_total_size(4) /* RTA_OIF */
4346                + nla_total_size(4) /* RTA_PRIORITY */
4347                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4348                + nla_total_size(sizeof(struct rta_cacheinfo))
4349                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4350                + nla_total_size(1) /* RTA_PREF */
4351                + lwtunnel_get_encap_size(rt->dst.lwtstate)
4352                + nexthop_len;
4353 }
4354
/* Fill the nexthop-related attributes (gateway, oif, encap) for @rt into
 * @skb and accumulate RTNH_F_* status bits into @*flags.
 *
 * @skip_oif: true when emitting an RTA_MULTIPATH entry, where the
 *	interface index lives in the rtnexthop struct instead of RTA_OIF.
 *
 * Returns 0 on success, -EMSGSIZE if @skb ran out of tailroom.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;
		/* a linkdown nexthop is reported dead when the netns is
		 * configured to ignore routes with carrier down
		 */
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4390
/* Append one rtnexthop entry (plus its nested attributes) for @rt to the
 * RTA_MULTIPATH container currently open in @skb.
 * Returns 0 on success, -EMSGSIZE on lack of skb space.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* on-wire hop count is weight - 1 */
	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;

	/* skip_oif=true: ifindex already carried in the rtnexthop struct */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
4417
/* Build one rtnetlink route message (of @type, e.g. RTM_NEWROUTE) for
 * @rt into @skb.
 *
 * @dst/@src: when non-NULL, report a /128 destination/source address
 *	instead of the route's own prefix (route-get replies).
 * @iif: input interface index to report, or 0 for none.
 *
 * Returns 0 on success, -EMSGSIZE if @skb has insufficient tailroom
 * (any partially built message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* reject routes encode their semantics in dst.error */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dst overrides the route prefix with a /128 */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are reported by the multicast
		 * routing code; 0 means it fully handled the message
		 */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		/* output path: report the preferred source address */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* a path-MTU learned for this route overrides the metric */
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4571
4572 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4573 {
4574         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4575         struct net *net = arg->net;
4576
4577         if (rt == net->ipv6.ip6_null_entry)
4578                 return 0;
4579
4580         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4581                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4582
4583                 /* user wants prefix routes only */
4584                 if (rtm->rtm_flags & RTM_F_PREFIX &&
4585                     !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4586                         /* success since this is not a prefix route */
4587                         return 1;
4588                 }
4589         }
4590
4591         return rt6_fill_node(net,
4592                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4593                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4594                      NLM_F_MULTI);
4595 }
4596
/* RTM_GETROUTE handler: resolve a route for the flow described by the
 * request attributes and unicast the resulting route message back to
 * the requester.  When RTM_F_FIB_MATCH is set, the matched FIB entry
 * (rt->from) is reported instead of the dst produced by the lookup.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		/* input path: look the flow up as if it arrived on @iif */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		/* output path */
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* swap the lookup result for the FIB entry it was derived from */
	if (fibmatch && rt->from) {
		struct rt6_info *ort = rt->from;

		dst_hold(&ort->dst);
		ip6_rt_put(rt);
		rt = ort;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the rt reference */
	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
4724
4725 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4726                      unsigned int nlm_flags)
4727 {
4728         struct sk_buff *skb;
4729         struct net *net = info->nl_net;
4730         u32 seq;
4731         int err;
4732
4733         err = -ENOBUFS;
4734         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4735
4736         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4737         if (!skb)
4738                 goto errout;
4739
4740         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4741                                 event, info->portid, seq, nlm_flags);
4742         if (err < 0) {
4743                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4744                 WARN_ON(err == -EMSGSIZE);
4745                 kfree_skb(skb);
4746                 goto errout;
4747         }
4748         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4749                     info->nlh, gfp_any());
4750         return;
4751 errout:
4752         if (err < 0)
4753                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4754 }
4755
/* netdevice notifier: the per-netns special routes (null entry and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, the prohibit/blackhole entries) live on
 * the loopback device.  Attach them (and take idev references) on
 * NETDEV_REGISTER; drop the references on NETDEV_UNREGISTER.  Events
 * for any non-loopback device are ignored.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
4788
4789 /*
4790  *      /proc
4791  */
4792
4793 #ifdef CONFIG_PROC_FS
4794
/* seq_file operations for /proc/net/ipv6_route; the open helper
 * (ipv6_route_open) is defined elsewhere in this file.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4801
/* Emit the single line of /proc/net/rt6_stats: seven hex counters in the
 * order fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, current dst entry count, fib_discarded_routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
4816
/* Open /proc/net/rt6_stats with per-netns single_open plumbing. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4821
/* file_operations for /proc/net/rt6_stats */
static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4828 #endif  /* CONFIG_PROC_FS */
4829
4830 #ifdef CONFIG_SYSCTL
4831
4832 static
4833 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4834                               void __user *buffer, size_t *lenp, loff_t *ppos)
4835 {
4836         struct net *net;
4837         int delay;
4838         if (!write)
4839                 return -EINVAL;
4840
4841         net = (struct net *)ctl->extra1;
4842         delay = net->ipv6.sysctl.flush_delay;
4843         proc_dointvec(ctl, write, buffer, lenp, ppos);
4844         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4845         return 0;
4846 }
4847
/* Template for the per-netns net.ipv6.route.* sysctl table.  The .data
 * pointers reference init_net here and are rewritten to the matching
 * per-netns fields (by index) in ipv6_route_sysctl_init(), so the entry
 * order below must stay in sync with that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
4921
/* Duplicate ipv6_route_table_template for @net and point each entry's
 * .data at the corresponding per-netns field.  The table[i] indices are
 * coupled to the ordering of ipv6_route_table_template — keep in sync.
 *
 * Returns the table, or NULL on allocation failure (caller handles).
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
4950 #endif
4951
/* Per-netns constructor for the IPv6 routing core: copy the dst_ops
 * template, clone the special route templates (null entry, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also prohibit/blackhole entries) for
 * this namespace, and seed the per-namespace GC/sysctl defaults.
 *
 * Returns 0 on success or -ENOMEM on allocation failure; partially
 * completed setup is unwound via the goto chain at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Clone the "no route" template and point it at this netns' dst_ops. */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* Extra templates used by policy routing (prohibit / blackhole). */
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for the per-netns tunables exposed via sysctl. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding: free in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5018
/* Per-netns destructor: free the special route clones and drop the
 * dst entry accounting set up by ip6_route_net_init().
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5028
5029 static int __net_init ip6_route_net_init_late(struct net *net)
5030 {
5031 #ifdef CONFIG_PROC_FS
5032         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5033         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
5034 #endif
5035         return 0;
5036 }
5037
/* Per-netns late teardown: remove the /proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5045
/* Pernet hooks for the routing core (dst_ops, special routes, sysctl
 * defaults); registered early in ip6_route_init().
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
	.async = true,
};
5051
5052 static int __net_init ipv6_inetpeer_init(struct net *net)
5053 {
5054         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5055
5056         if (!bp)
5057                 return -ENOMEM;
5058         inet_peer_base_init(bp);
5059         net->ipv6.peers = bp;
5060         return 0;
5061 }
5062
5063 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5064 {
5065         struct inet_peer_base *bp = net->ipv6.peers;
5066
5067         net->ipv6.peers = NULL;
5068         inetpeer_invalidate_tree(bp);
5069         kfree(bp);
5070 }
5071
/* Pernet hooks for per-namespace inet_peer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
	.async	=	true,
};
5077
/* Late pernet hooks (/proc entries); registered in ip6_route_init()
 * only after FIB, xfrm and policy rules are up.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
	.async = true,
};
5083
/* Netdevice event handler.  Priority is set just below addrconf's
 * (ADDRCONF_NOTIFY_PRIORITY - 10) so addrconf processes device events
 * before this notifier does.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5088
5089 void __init ip6_route_init_special_entries(void)
5090 {
5091         /* Registering of the loopback is done before this portion of code,
5092          * the loopback reference in rt6_info will not be taken, do it
5093          * manually for init_net */
5094         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5095         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5096   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5097         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5098         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5099         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5100         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5101   #endif
5102 }
5103
/* Module init for the IPv6 routing subsystem.  Sets up the dst slab
 * cache, registers the pernet subsystems, the FIB, xfrm and policy-
 * rule layers, the rtnetlink route handlers, the netdevice notifier,
 * and initializes the per-cpu uncached-route lists.
 *
 * On any failure, everything registered so far is unwound through the
 * label chain at the bottom, in reverse order of setup.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts are carved from the same slab as regular rt6_infos. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* Netlink handlers for route add / delete / lookup. */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwinding, in reverse order of the setup above.
	 * NOTE(review): the labels "fib6_rules_init" and "xfrm6_init"
	 * shadow the names of the functions whose failure jumps to
	 * them; out_-prefixed names would be clearer.
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5195
/* Module teardown: unwind everything ip6_route_init() set up, in
 * reverse order of registration.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}