/*
 * Source: linux.git blob view of net/ipv6/route.c.
 * (gitweb navigation header removed; it carried the repo banner, the
 * unrelated commit title "mmc: core: do not retry CMD6 in __mmc_switch()"
 * and the path breadcrumb "linux.git / net / ipv6 / route.c".)
 */
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <[email protected]>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/* Result of a next-hop reachability check (see rt6_check_neigh()). */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable; skip it */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* not reachable; try round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
86
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/* Per-cpu list of rt6_info dsts that are not attached to the fib tree
 * ("uncached"); needed so they can be found when their device goes away.
 * Each list is protected by its own spinlock.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
/* Unlink @rt from the per-cpu uncached list it was added to, if any,
 * and drop the per-netns uncached-route counter accordingly.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
158
/* Device teardown helper: re-point every uncached route still using
 * @dev (either as dst device or via its inet6_dev) at the netns
 * loopback device, so the refcounts on @dev can reach zero.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback itself is never unregistered this way */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* hold loopback before releasing the old device */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
/* Look up — or, on a miss, create — the neighbour entry for a route's
 * next hop on @dev.  Returns NULL if the entry cannot be created.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	/* not cached: create a fresh ndisc entry; errors map to NULL */
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
217
/* dst_ops->neigh_lookup: resolve via the route's gateway address. */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
226
/* dst_ops->confirm_neigh: mark the next hop as recently confirmed,
 * skipping cases where neighbour discovery does not apply.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	/* no neighbour state on NOARP or loopback devices */
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	/* multicast destinations have no neighbour entry to confirm */
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
241
/* dst_ops template for ordinary IPv6 routes; copied into each netns's
 * ipv6.ip6_dst_ops (see the per-netns users of ip6_dst_alloc()).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
260
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264
265         return mtu ? : dst->dev->mtu;
266 }
267
/* PMTU updates are deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
272
/* Redirects are deliberately ignored on blackhole dsts. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
277
/* dst_ops for blackholed dsts: no gc, and PMTU updates / redirects
 * are no-ops (see the stubs above).
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
289
/* Metrics used by the special route templates below; only the hop
 * limit slot is named, and it is left at 0 (unset).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
293
/* fib6_info returned when a lookup matches nothing: an RTN_UNREACHABLE
 * reject route with the worst possible metric.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
302
/* dst returned for "no route": discards packets with -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
314
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316
/* dst for policy-routing "prohibit": rejects packets with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
328
/* dst for policy-routing "blackhole": silently discards packets. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
340
341 #endif
342
/* Zero the rt6_info fields that follow the embedded dst_entry (the dst
 * itself was set up by dst_alloc()) and init the uncached-list link.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
350
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353                                int flags)
354 {
355         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356                                         1, DST_OBSOLETE_FORCE_CHK, flags);
357
358         if (rt) {
359                 rt6_info_init(rt);
360                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361         }
362
363         return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366
/* dst_ops->destroy: release everything a cached rt6_info holds —
 * metrics, uncached-list membership, the inet6_dev reference and the
 * originating fib6_info ("from") reference.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* detach rt->from under RCU before dropping the reference */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
388
/* dst_ops->ifdown: when @dev goes down, migrate the route's inet6_dev
 * reference over to the netns loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
405
406 static bool __rt6_check_expired(const struct rt6_info *rt)
407 {
408         if (rt->rt6i_flags & RTF_EXPIRES)
409                 return time_after(jiffies, rt->dst.expires);
410         else
411                 return false;
412 }
413
/* Like __rt6_check_expired() but also considers the fib6_info the
 * cached route was created from.  Uses rcu_dereference(), so the
 * caller is expected to hold rcu_read_lock().
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		/* a stale dst or an expired origin also counts as expired */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
429
/* Multipath next-hop selection: pick @match or one of its siblings
 * based on the flow hash in @fl6->mp_hash and each nexthop's upper
 * bound.  Falls back to @match when no usable sibling owns the hash.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* hash region matched; only use the sibling if it scores */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
/* Narrow @rt (and its fib6_next chain) to an entry usable for
 * (@saddr, @oif): skip RTNH_F_DEAD nexthops, match the output
 * interface when @oif is given, otherwise find a device owning @saddr.
 * Returns fib6_null_entry when strict interface matching
 * (RT6_LOOKUP_F_IFACE) rules everything out.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	/* fast path: nothing to match against and the nexthop is alive */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
500
501 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for a single router reachability probe:
 * the target address and the (held) device to send the NS on.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
507
/* Workqueue handler: send a Neighbour Solicitation to the probe target
 * and release the device reference and work item taken by rt6_probe().
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
519
/* Schedule a router reachability probe for @rt's gateway, rate-limited
 * to one per rtr_probe_interval (per neighbour, or per route when no
 * neighbour entry exists yet).  The actual NS is sent from workqueue
 * context by rt6_probe_deferred().
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* re-check state under the neigh lock and rate-limit on
		 * neigh->updated before committing to a probe
		 */
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		/* no neighbour entry yet: rate-limit on the route itself */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released by rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
574 #else
/* Router reachability probing needs CONFIG_IPV6_ROUTER_PREF; no-op here. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
578 #endif
579
580 /*
581  * Default Router Selection (RFC 2461 6.3.6)
582  */
583 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584 {
585         const struct net_device *dev = rt->fib6_nh.nh_dev;
586
587         if (!oif || dev->ifindex == oif)
588                 return 2;
589         return 0;
590 }
591
/* Classify next-hop reachability for router selection.  Routes without
 * a gateway (or marked RTF_NONEXTHOP) trivially succeed; otherwise the
 * neighbour's NUD state decides between the RT6_NUD_* values.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preferences, only NUD_FAILED asks for a probe */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* unknown neighbour: optimistic with router prefs, else RR */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
623
/* Score @rt for default-router selection: interface match contributes 2,
 * and with CONFIG_IPV6_ROUTER_PREF the decoded route preference sits in
 * the bits above.  Returns a negative RT6_NUD_* value when the route
 * must be skipped.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
641
/* True if routes through @f6i's device should be ignored while the
 * link is down (per-device ignore_routes_with_linkdown sysctl).
 * Called with rcu_read_lock held.
 */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
656
/* Compare @rt against the best candidate so far (@match, whose score is
 * in *mpri) and return whichever wins.  Dead, link-down (when not
 * ignored) and expired routes never win.  *do_rr is set when the best
 * route is only round-robin quality.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
695
/* Find the best route among the entries of one fib6 node that share
 * @metric, scanning from the round-robin head @rr_head and wrapping
 * around via @leaf.  Entries with a different metric ("cont") are only
 * considered when nothing at @metric matched.
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first half: from the round-robin head to the end of the metric run */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second half: from the leaf up to (not including) the head */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* nothing at @metric: fall back to the higher-metric tail */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
734
/* Select the route to use from fib6 node @fn, applying default-router
 * selection and rotating the node's round-robin pointer when asked to.
 * Returns fib6_null_entry when the node has nothing usable.
 * rcu_read_lock() should be held (see the section comment above).
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
784
785 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
786 {
787         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
788 }
789
790 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a Route Information option received in a Router Advertisement
 * on @dev from @gwaddr: validate the option, then add, refresh or
 * delete the corresponding RTF_ROUTEINFO route.  Returns 0 on success
 * or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/* length 3 carries the full 128-bit prefix in the option */
	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 means a default route via this router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		/* infinite lifetime clears the expiry; otherwise refresh it */
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
864 #endif
865
866 /*
867  *      Misc support functions
868  */
869
/* Resolve the net_device that a dst copied from @rt should be bound to.
 * Called with rcu_read_lock held (required by l3mdev_master_dev_rcu()).
 */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
892
/* Per-RTN_* route type: the dst error delivered to users that hit the
 * route.  0 means the route type forwards/delivers normally.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
907
/* Map a fib6 route type to its dst error code (see fib6_prop table). */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
912
913 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
914 {
915         unsigned short flags = 0;
916
917         if (rt->dst_nocount)
918                 flags |= DST_NOCOUNT;
919         if (rt->dst_nopolicy)
920                 flags |= DST_NOPOLICY;
921         if (rt->dst_host)
922                 flags |= DST_HOST;
923
924         return flags;
925 }
926
/* Initialize dst input/output handlers and dst.error for a RTF_REJECT
 * route, dispatching on the route's fib6_type.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		/* silently discard in both directions */
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
948
/* Initialize the dst fields (error, input/output handlers, lwtunnel
 * state, lastuse) of @rt from the fib entry @ort it is derived from.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		/* locally delivered */
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
974
/* Link @rt back to the fib entry @from it was created from and inherit
 * its metrics.  Clears RTF_EXPIRES so expiry is tracked via @from.
 * Caller must already hold reference to @from.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
982
/* Populate a freshly allocated rt6_info @rt from fib entry @ort:
 * dst handlers, destination/source prefixes, idev, gateway and flags.
 * Caller must already hold reference to @ort.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
999
/* Walk back up the fib trie from @fn until a node carrying route info
 * (RTN_RTINFO) is found, descending into a parent's source-address
 * subtree (when present and not the node we came from) along the way.
 * Returns NULL once the tree root is reached without a match.
 * Runs under rcu_read_lock (parent pointers are rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1017
1018 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1019                           bool null_fallback)
1020 {
1021         struct rt6_info *rt = *prt;
1022
1023         if (dst_hold_safe(&rt->dst))
1024                 return true;
1025         if (null_fallback) {
1026                 rt = net->ipv6.ip6_null_entry;
1027                 dst_hold(&rt->dst);
1028         } else {
1029                 rt = NULL;
1030         }
1031         *prt = rt;
1032         return false;
1033 }
1034
/* Allocate an uncached rt6_info (dst) copy of fib entry @rt.
 * Called with rcu_lock held.  Takes a reference on @rt (held by the
 * returned dst via rt6_set_from); releases it again on allocation
 * failure.  Returns NULL if @rt is being freed or allocation fails.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}
1053
/* Policy-table lookup: walk the fib trie of @table for fl6->daddr/saddr,
 * backtracking until a usable entry is found, then return a dst with a
 * reference held - a cached exception route when one exists, otherwise a
 * fresh uncached copy of the fib entry, falling back to the null entry.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked to ignore the nexthop's output interface */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	/* no match at this node: retry from parent/subtree */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}
1109
/* Public entry point for a flow-based route lookup honoring policy
 * (fib) rules; returns a referenced dst_entry.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1116
/* Convenience lookup by daddr/saddr/oif.  @saddr may be NULL.  When
 * @strict is set the lookup is pinned to @oif.  Returns a referenced
 * rt6_info on success, or NULL if the resolved dst carries an error
 * (reject/null routes); never returns an error-valued pointer.
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
1142
/* __ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold a reference on @rt before calling.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
1162
/* Insert @rt into its fib table without netlink notification context.
 * See __ip6_ins_rt() for locking and reference requirements.
 */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1169
/* Allocate an RTF_CACHE clone of fib entry @ort for destination @daddr
 * (and @saddr under subtrees).  Runs under rcu_read_lock - it uses
 * ip6_rt_get_dev_rcu().  Returns NULL if @ort is being freed or the
 * allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* cache entries are host routes for the specific destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1211
/* Allocate a per-cpu (RTF_PCPU) dst copy of fib entry @rt.  Takes a
 * reference on @rt (released on allocation failure).  Returns NULL if
 * @rt is being freed or the dst allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1233
/* Return this cpu's cached pcpu dst for @rt with a reference taken, or
 * NULL if none exists (or its refcount already hit zero).
 * It should be called with rcu_read_lock() acquired.
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}
1247
/* Allocate and install this cpu's pcpu dst for @rt, returning it with a
 * reference held.  Falls back to the (held) null entry on allocation
 * failure.  The cmpxchg cannot lose: the slot was observed empty and
 * preemption semantics of this path keep it empty - hence BUG_ON(prev).
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1266
/* exception hash table implementation
 *
 * Serializes all modifications of any fib6_info's rt6i_exception_bucket
 * (insert/remove/flush); lookups may instead run under RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1270
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the hash table's reference; rt6_ex freed after grace period */
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1301
1302 /* Remove oldest rt6_ex in bucket and free the memory
1303  * Caller must hold rt6_exception_lock
1304  */
1305 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1306 {
1307         struct rt6_exception *rt6_ex, *oldest = NULL;
1308
1309         if (!bucket)
1310                 return;
1311
1312         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1313                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1314                         oldest = rt6_ex;
1315         }
1316         rt6_remove_exception(bucket, oldest);
1317 }
1318
/* Hash (dst[, src]) into a bucket index for the exception table.
 * @src participates only when IPV6_SUBTREES is enabled and non-NULL.
 * The jhash seed is randomized once at boot to resist hash-collision
 * attacks from off-path senders.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1334
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair (note: *bucket is advanced by the hash index,
 * so the caller gets back the exact bucket searched).
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1367
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair - RCU variant of __rt6_find_exception_spinlock().
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1402
/* Effective MTU of fib entry @rt: the route's PMTU metric when set,
 * otherwise the nexthop device's mtu6; capped at IP6_MAX_MTU and
 * reduced by any lwtunnel encapsulation headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): assumes the nexthop dev always has an
		 * inet6_dev; __in6_dev_get() can return NULL - confirm
		 * callers guarantee this.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
1423
/* Insert cached route @nrt into the exception table hanging off fib
 * entry @ort, allocating the bucket array on first use and evicting an
 * existing entry for the same (dst[, src]) key.  On success the table
 * owns the dst reference passed in with @nrt and the tree sernum is
 * bumped to invalidate stale cached dsts.  Returns 0 or -errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* ort is being unlinked; do not resurrect its bucket list */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1503
/* Drop every cached exception route hanging off fib entry @rt and mark
 * the entry flushed so rt6_insert_exception() cannot repopulate it.
 * The bucket array itself is freed elsewhere (after a grace period).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1530
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock().
 * Returns the cached entry for (daddr[, saddr]) if present and not
 * expired, otherwise NULL.  No reference is taken - caller must do so
 * before leaving the RCU section.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1562
1563 /* Remove the passed in cached rt from the hash table that contains it */
1564 static int rt6_remove_exception_rt(struct rt6_info *rt)
1565 {
1566         struct rt6_exception_bucket *bucket;
1567         struct in6_addr *src_key = NULL;
1568         struct rt6_exception *rt6_ex;
1569         struct fib6_info *from;
1570         int err;
1571
1572         from = rcu_dereference(rt->from);
1573         if (!from ||
1574             !(rt->rt6i_flags & RTF_CACHE))
1575                 return -EINVAL;
1576
1577         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1578                 return -ENOENT;
1579
1580         spin_lock_bh(&rt6_exception_lock);
1581         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1582                                     lockdep_is_held(&rt6_exception_lock));
1583 #ifdef CONFIG_IPV6_SUBTREES
1584         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1585          * and exception table is indexed by a hash of
1586          * both rt6i_dst and rt6i_src.
1587          * Otherwise, the exception table is indexed by
1588          * a hash of only rt6i_dst.
1589          */
1590         if (from->fib6_src.plen)
1591                 src_key = &rt->rt6i_src.addr;
1592 #endif
1593         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1594                                                &rt->rt6i_dst.addr,
1595                                                src_key);
1596         if (rt6_ex) {
1597                 rt6_remove_exception(bucket, rt6_ex);
1598                 err = 0;
1599         } else {
1600                 err = -ENOENT;
1601         }
1602
1603         spin_unlock_bh(&rt6_exception_lock);
1604         return err;
1605 }
1606
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp (used by the LRU eviction in
 * rt6_exception_remove_oldest()).  Silently does nothing when @rt is
 * not a cached route or has no exception entry.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1643
1644 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1645                                          struct rt6_info *rt, int mtu)
1646 {
1647         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1648          * lowest MTU in the path: always allow updating the route PMTU to
1649          * reflect PMTU decreases.
1650          *
1651          * If the new MTU is higher, and the route PMTU is equal to the local
1652          * MTU, this means the old MTU is the lowest in the path, so allow
1653          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1654          * handle this.
1655          */
1656
1657         if (dst_mtu(&rt->dst) >= mtu)
1658                 return true;
1659
1660         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1661                 return true;
1662
1663         return false;
1664 }
1665
/* Propagate a device MTU change to all cached exception routes hanging
 * off fib entry @rt, subject to rt6_mtu_change_route_allowed().
 * Caller must hold rt6_exception_lock (per the rcu_dereference_protected
 * condition below).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1694
/* A cached exception route that also goes through a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove all cached gateway exception routes of fib entry @rt whose
 * gateway equals @gateway (e.g. when that router is no longer usable).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1731
/* GC helper: examine one exception entry and remove it when aged out,
 * expired, or pointing at a gateway whose neighbour lost NTF_ROUTER.
 * Entries that survive bump gc_args->more so GC is rescheduled.
 * Caller holds rt6_exception_lock (rt6_remove_exception requires it).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1775
/* Walk all cached exception routes hanging off @rt and let
 * rt6_age_examine_exception() decide which ones to drop.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: nothing cached for this route */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	/* BH-disabled RCU section plus the exception lock: the examine
	 * helper does a _noref neighbour lookup and may unlink entries.
	 */
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1806
/* must be called with rcu lock held */
/* Find the best fib6_info in @table for the flow @fl6.  Backtracks up
 * the fib tree whenever the current node only yields the null entry,
 * and finally retries once from the original node without
 * RT6_LOOKUP_F_REACHABLE, so an unreachable route still beats no
 * route at all.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* Caller asked to ignore the nexthop's output-device constraint */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
1838
/* Resolve a route for @fl6 in @table and return a dst-holding
 * rt6_info.  After the fib lookup (and multipath sibling selection
 * when the entry has siblings), the result is, in order: a cached
 * exception route, a freshly built uncached RTF_CACHE clone
 * (FLOWI_FLAG_KNOWN_NH on a non-gateway route), or a per-cpu copy of
 * the fib entry.  Every return path hands back a reference.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When not forwarding, prefer routers known to be reachable */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* NOTE(review): bh disabled around the per-cpu slot access,
		 * presumably to avoid softirq re-entry on this cpu - confirm
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1916
1917 static struct rt6_info *ip6_pol_route_input(struct net *net,
1918                                             struct fib6_table *table,
1919                                             struct flowi6 *fl6,
1920                                             const struct sk_buff *skb,
1921                                             int flags)
1922 {
1923         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1924 }
1925
1926 struct dst_entry *ip6_route_input_lookup(struct net *net,
1927                                          struct net_device *dev,
1928                                          struct flowi6 *fl6,
1929                                          const struct sk_buff *skb,
1930                                          int flags)
1931 {
1932         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1933                 flags |= RT6_LOOKUP_F_IFACE;
1934
1935         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1936 }
1937 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1938
/* Fill the L3 members of @keys for multipath hashing of @skb.
 * For ICMPv6 error messages that embed the offending packet, hash on
 * the embedded (inner) IPv6 header instead, so the error follows the
 * same path as the flow it refers to; any pre-dissected @flkeys
 * describe the outer header and are discarded in that case.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only these error types carry the original packet */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* Switch to the embedded header; outer flkeys no longer apply */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
1986
/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash for a flow according to the namespace's
 * multipath hash policy: 0 = L3 (addresses, flow label, protocol),
 * 1 = L4 five-tuple.  The result is shifted right by one bit.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* L3 policy: hash on addresses, flow label and protocol */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		/* L4 policy: hash on the five-tuple */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* Dissect now if the caller did not pre-dissect */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2043
/* Route an incoming packet: build a flowi6 from the IPv6 header and
 * the receiving device, look the route up, and attach the resulting
 * dst to @skb (replacing any existing one).
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Carry the tunnel key into the lookup for rx-side metadata dsts */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors hash on the embedded flow for multipath selection */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2073
2074 static struct rt6_info *ip6_pol_route_output(struct net *net,
2075                                              struct fib6_table *table,
2076                                              struct flowi6 *fl6,
2077                                              const struct sk_buff *skb,
2078                                              int flags)
2079 {
2080         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2081 }
2082
/* Resolve an output route for @fl6, possibly on behalf of @sk.
 * Multicast/link-local destinations are first offered to the l3mdev
 * (VRF) link-scope lookup; otherwise strict-interface and
 * source-address flags are derived from the flow and socket state
 * before the policy lookup.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* Locally originated traffic: input interface is loopback */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* Let the socket's address-selection preferences apply */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2112
/* Replace @dst_orig with a blackhole clone: a dst whose input and
 * output discard all packets but which preserves the original's
 * metrics, gateway, addresses and flags (minus RTF_PCPU).  Consumes
 * the caller's reference on @dst_orig and returns the new dst, or
 * ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* Everything through this dst is silently dropped */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2145
2146 /*
2147  *      Destination cache support functions
2148  */
2149
2150 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2151 {
2152         u32 rt_cookie = 0;
2153
2154         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2155                 return false;
2156
2157         if (fib6_check_expired(f6i))
2158                 return false;
2159
2160         return true;
2161 }
2162
2163 static struct dst_entry *rt6_check(struct rt6_info *rt,
2164                                    struct fib6_info *from,
2165                                    u32 cookie)
2166 {
2167         u32 rt_cookie = 0;
2168
2169         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2170             rt_cookie != cookie)
2171                 return NULL;
2172
2173         if (rt6_check_expired(rt))
2174                 return NULL;
2175
2176         return &rt->dst;
2177 }
2178
2179 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2180                                             struct fib6_info *from,
2181                                             u32 cookie)
2182 {
2183         if (!__rt6_check_expired(rt) &&
2184             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2185             fib6_check(from, cookie))
2186                 return &rt->dst;
2187         else
2188                 return NULL;
2189 }
2190
/* dst_ops->check hook: decide whether a cached dst is still usable
 * for @cookie, returning the dst or NULL.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu copies and uncached clones validate against their
	 * parent fib entry; everything else uses the plain cookie check.
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2218
/* dst_ops->negative_advice hook: a socket reports its cached route
 * looks bad.  An expired cached exception is unlinked from the
 * exception table; any non-cache route is released outright.  A NULL
 * return tells the caller to drop the dst and re-resolve.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2238
/* dst_ops->link_failure hook: report "address unreachable" back to
 * the sender and invalidate the route the packet used - cached
 * exceptions are removed; a default route gets its fib node's sernum
 * set to -1 so dependent cookie checks fail and dsts revalidate.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2264
/* Put @rt0 into RTF_EXPIRES mode with an expiry @timeout jiffies from
 * now.  A route that was not previously expiring first inherits
 * dst.expires from its parent fib entry before dst_set_expires()
 * applies the new timeout.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2280
/* Record a learned path MTU on @rt, mark the route as modified, and
 * arm its expiry with the per-namespace pmtu lifetime
 * (sysctl ip6_rt_mtu_expires).
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2289
2290 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2291 {
2292         bool from_set;
2293
2294         rcu_read_lock();
2295         from_set = !!rcu_dereference(rt->from);
2296         rcu_read_unlock();
2297
2298         return !(rt->rt6i_flags & RTF_CACHE) &&
2299                 (rt->rt6i_flags & RTF_PCPU || from_set);
2300 }
2301
2302 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2303                                  const struct ipv6hdr *iph, u32 mtu)
2304 {
2305         const struct in6_addr *daddr, *saddr;
2306         struct rt6_info *rt6 = (struct rt6_info *)dst;
2307
2308         if (dst_metric_locked(dst, RTAX_MTU))
2309                 return;
2310
2311         if (iph) {
2312                 daddr = &iph->daddr;
2313                 saddr = &iph->saddr;
2314         } else if (sk) {
2315                 daddr = &sk->sk_v6_daddr;
2316                 saddr = &inet6_sk(sk)->saddr;
2317         } else {
2318                 daddr = NULL;
2319                 saddr = NULL;
2320         }
2321         dst_confirm_neigh(dst, daddr);
2322         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2323         if (mtu >= dst_mtu(dst))
2324                 return;
2325
2326         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2327                 rt6_do_update_pmtu(rt6, mtu);
2328                 /* update rt6_ex->stamp for cache */
2329                 if (rt6->rt6i_flags & RTF_CACHE)
2330                         rt6_update_exception_stamp_rt(rt6);
2331         } else if (daddr) {
2332                 struct fib6_info *from;
2333                 struct rt6_info *nrt6;
2334
2335                 rcu_read_lock();
2336                 from = rcu_dereference(rt6->from);
2337                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2338                 if (nrt6) {
2339                         rt6_do_update_pmtu(nrt6, mtu);
2340                         if (rt6_insert_exception(nrt6, from))
2341                                 dst_release_immediate(&nrt6->dst);
2342                 }
2343                 rcu_read_unlock();
2344         }
2345 }
2346
2347 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2348                                struct sk_buff *skb, u32 mtu)
2349 {
2350         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2351 }
2352
/* Update the path MTU for the flow described by the IPv6 header at
 * skb->data (e.g. the packet quoted in an ICMPv6 Packet Too Big).
 * @mtu is in network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	/* Look the route up ourselves, apply the pmtu, drop our ref */
	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2373
/* Socket flavour of ip6_update_pmtu(): derive oif/mark/uid from @sk,
 * then, if the socket's cached dst no longer validates, refresh the
 * datagram route (skipped when the socket is user-locked or the peer
 * is a v4-mapped address).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* Unbound socket: fall back to the L3 master device, if any */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2395
/* Cache @dst on @sk via ip6_dst_store().  The destination (and,
 * under CONFIG_IPV6_SUBTREES, the source) address is recorded only
 * when it equals the socket's own address; otherwise NULL is passed.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2412
/* Handle redirects */
/* A flowi6 extended with the redirecting gateway's address.  It is
 * passed to fib6_rule_lookup() as a plain flowi6 (fl6 must stay the
 * first member) and cast back in __ip6_route_redirect().
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2418
/* Lookup used when processing an ICMPv6 redirect: find the route
 * currently used towards fl6->daddr and accept it only if it goes via
 * the advertising gateway (rdfl->gateway), checking cached exception
 * routes whose gateway may differ from their parent's.  Returns a
 * held rt6_info (possibly the null entry).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* No usable route at this node: climb the tree and retry */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};
2498
2499 static struct dst_entry *ip6_route_redirect(struct net *net,
2500                                             const struct flowi6 *fl6,
2501                                             const struct sk_buff *skb,
2502                                             const struct in6_addr *gateway)
2503 {
2504         int flags = RT6_LOOKUP_F_HAS_SADDR;
2505         struct ip6rd_flowi rdfl;
2506
2507         rdfl.fl6 = *fl6;
2508         rdfl.gateway = *gateway;
2509
2510         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2511                                 flags, __ip6_route_redirect);
2512 }
2513
/* Process an ICMPv6 redirect for the packet whose IPv6 header is at
 * skb->data: look up the route constrained to the redirecting
 * gateway (the outer saddr) and apply the redirect to it.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2534
/* Variant of ip6_redirect() for redirect messages that do not carry
 * the original packet header: the target is taken from the rd_msg and
 * the addresses from the outer IPv6 header (the lookup's saddr is the
 * outer daddr, i.e. the address the redirect was sent to).
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2552
2553 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2554 {
2555         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2556                      sk->sk_uid);
2557 }
2558 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2559
2560 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2561 {
2562         struct net_device *dev = dst->dev;
2563         unsigned int mtu = dst_mtu(dst);
2564         struct net *net = dev_net(dev);
2565
2566         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2567
2568         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2569                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2570
2571         /*
2572          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2573          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2574          * IPV6_MAXPLEN is also valid and means: "any MSS,
2575          * rely only on pmtu discovery"
2576          */
2577         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2578                 mtu = IPV6_MAXPLEN;
2579         return mtu;
2580 }
2581
2582 static unsigned int ip6_mtu(const struct dst_entry *dst)
2583 {
2584         struct inet6_dev *idev;
2585         unsigned int mtu;
2586
2587         mtu = dst_metric_raw(dst, RTAX_MTU);
2588         if (mtu)
2589                 goto out;
2590
2591         mtu = IPV6_MIN_MTU;
2592
2593         rcu_read_lock();
2594         idev = __in6_dev_get(dst->dev);
2595         if (idev)
2596                 mtu = idev->cnf.mtu6;
2597         rcu_read_unlock();
2598
2599 out:
2600         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2601
2602         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2603 }
2604
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* a locked MTU metric on the route always wins */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* with subtrees, exceptions are additionally keyed by source */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* next preference: MTU from a (non-expired) cached exception */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* final fallback: the nexthop device's mtu6 (>= IPV6_MIN_MTU) */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* account for lwtunnel encap headroom on the nexthop */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
2652
/* Allocate a one-off host route for sending an ICMPv6 packet along
 * @fl6 out of @dev.  Returns the dst produced by xfrm_lookup() or an
 * ERR_PTR on failure.  The route is kept on the uncached list rather
 * than inserted into the FIB.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* drop the reference taken by in6_dev_get() above */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* host route to fl6->daddr; idev reference is handed to rt */
	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	/* NOTE(review): RTAX_HOPLIMIT of 0 presumably means "use the
	 * default hop limit" — confirm against dst hoplimit readers */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2691
/* dst_ops->gc hook for IPv6 routes.
 *
 * Returns nonzero when the dst entry count still exceeds
 * ip6_rt_max_size after running garbage collection, i.e. further dst
 * allocations should be refused.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* rate-limit: skip GC if one ran recently and we are under the cap */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* ip6_rt_gc_expire is the expiry argument fed to fib6_run_gc();
	 * it grows while under pressure and is reset once the entry
	 * count drops below the gc threshold
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* geometric decay of the expire value between invocations.
	 * NOTE(review): rt_elasticity == 0 zeroes it completely
	 * (x -= x >> 0) — confirm that is the intended behavior.
	 */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2716
2717 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2718                                             struct fib6_config *cfg,
2719                                             const struct in6_addr *gw_addr,
2720                                             u32 tbid, int flags)
2721 {
2722         struct flowi6 fl6 = {
2723                 .flowi6_oif = cfg->fc_ifindex,
2724                 .daddr = *gw_addr,
2725                 .saddr = cfg->fc_prefsrc,
2726         };
2727         struct fib6_table *table;
2728         struct rt6_info *rt;
2729
2730         table = fib6_get_table(net, tbid);
2731         if (!table)
2732                 return NULL;
2733
2734         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2735                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2736
2737         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2738         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2739
2740         /* if table lookup failed, fall back to full lookup */
2741         if (rt == net->ipv6.ip6_null_entry) {
2742                 ip6_rt_put(rt);
2743                 rt = NULL;
2744         }
2745
2746         return rt;
2747 }
2748
/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve (in
 * the device's table, or main) to a local/anycast/reject route or to a
 * route via a different device than @dev.  Returns 0 if acceptable,
 * -EINVAL otherwise with extack set.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	/* route types that disqualify the gateway */
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		/* grt->from is RCU-protected; only deref under the lock */
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2781
/* Resolve the gateway in @cfg via a route lookup and, when no egress
 * device was supplied, derive *_dev and *idev from the matched route
 * (taking a reference on each).  Returns 0 only when the gateway is
 * directly reachable (matched route has no RTF_GATEWAY), otherwise
 * -EHOSTUNREACH.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* prefer a lookup restricted to the configured table */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* discard matches via a gateway or wrong device */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* fall back to a full (all-tables) lookup */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device/idev from the matched route; references
		 * are taken here and handed back to the caller
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2832
/* Validate cfg->fc_gateway for a new route, resolving the egress
 * device and idev when not yet known (via ip6_route_check_nh*).
 * Rejects gateways that are local addresses, addresses that are
 * neither link-local unicast nor (as an exception) unicast/IPv4-mapped,
 * and loopback or missing egress devices.  Returns 0 on success or a
 * negative errno with extack set.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* for non-link-local gateways the local-address check may match
	 * on any device, not just *_dev
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		/* non-link-local gateways must be reachable; this may
		 * also fill in *_dev/*idev when they were not given
		 */
		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2905
/* Build a fib6_info from a netlink route configuration.
 *
 * Validates @cfg, resolves the egress device/idev and (optional)
 * gateway, and returns the new fib6_info holding a device reference in
 * fib6_nh.nh_dev.  Nothing is inserted into the FIB here.  On failure
 * returns ERR_PTR(-errno) with any transient references dropped.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* resolve the egress device; references on dev and idev are
	 * held until install or the error path below
	 */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink routes require an explicit, up nexthop device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* lightweight tunnel encapsulation state, if requested */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	/* gateway validation may also resolve dev/idev */
	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* the dev reference is kept by the route; idev's is dropped */
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
3131
3132 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3133                   struct netlink_ext_ack *extack)
3134 {
3135         struct fib6_info *rt;
3136         int err;
3137
3138         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3139         if (IS_ERR(rt))
3140                 return PTR_ERR(rt);
3141
3142         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3143         fib6_info_release(rt);
3144
3145         return err;
3146 }
3147
3148 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3149 {
3150         struct net *net = info->nl_net;
3151         struct fib6_table *table;
3152         int err;
3153
3154         if (rt == net->ipv6.fib6_null_entry) {
3155                 err = -ENOENT;
3156                 goto out;
3157         }
3158
3159         table = rt->fib6_table;
3160         spin_lock_bh(&table->tb6_lock);
3161         err = fib6_del(rt, info);
3162         spin_unlock_bh(&table->tb6_lock);
3163
3164 out:
3165         fib6_info_release(rt);
3166         return err;
3167 }
3168
3169 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3170 {
3171         struct nl_info info = { .nl_net = net };
3172
3173         return __ip6_del_rt(rt, &info);
3174 }
3175
/* Delete @rt and, when fc_delete_all_nh is set, all of its multipath
 * siblings under a single table lock.  When possible a single
 * RTM_DELROUTE notification describing every hop is sent instead of
 * one per sibling.  The caller's reference on @rt is consumed.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* fill the combined message now, while all hops
			 * are still in the tree; suppress the per-route
			 * notifications only if that succeeded
			 */
			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3227
3228 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3229 {
3230         int rc = -ESRCH;
3231
3232         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3233                 goto out;
3234
3235         if (cfg->fc_flags & RTF_GATEWAY &&
3236             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3237                 goto out;
3238
3239         rc = rt6_remove_exception_rt(rt);
3240 out:
3241         return rc;
3242 }
3243
/* Delete the route described by @cfg.  With RTF_CACHE only matching
 * cached (exception) routes are removed; otherwise the first fib6_info
 * matching the device/gateway/metric/protocol constraints is deleted —
 * a single hop when a gateway was given, else the route including its
 * multipath siblings.  Returns 0 or a negative errno (-ESRCH when
 * nothing matched).
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* for RTF_CACHE the exact node is not required (last arg) */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH means "this exception
					 * didn't match" — keep scanning
					 */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* take a reference before leaving the RCU
			 * section; skip entries already going away
			 */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3308
3309 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3310 {
3311         struct netevent_redirect netevent;
3312         struct rt6_info *rt, *nrt = NULL;
3313         struct ndisc_options ndopts;
3314         struct inet6_dev *in6_dev;
3315         struct neighbour *neigh;
3316         struct fib6_info *from;
3317         struct rd_msg *msg;
3318         int optlen, on_link;
3319         u8 *lladdr;
3320
3321         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3322         optlen -= sizeof(*msg);
3323
3324         if (optlen < 0) {
3325                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3326                 return;
3327         }
3328
3329         msg = (struct rd_msg *)icmp6_hdr(skb);
3330
3331         if (ipv6_addr_is_multicast(&msg->dest)) {
3332                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3333                 return;
3334         }
3335
3336         on_link = 0;
3337         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3338                 on_link = 1;
3339         } else if (ipv6_addr_type(&msg->target) !=
3340                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3341                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3342                 return;
3343         }
3344
3345         in6_dev = __in6_dev_get(skb->dev);
3346         if (!in6_dev)
3347                 return;
3348         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3349                 return;
3350
3351         /* RFC2461 8.1:
3352          *      The IP source address of the Redirect MUST be the same as the current
3353          *      first-hop router for the specified ICMP Destination Address.
3354          */
3355
3356         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3357                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3358                 return;
3359         }
3360
3361         lladdr = NULL;
3362         if (ndopts.nd_opts_tgt_lladdr) {
3363                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3364                                              skb->dev);
3365                 if (!lladdr) {
3366                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3367                         return;
3368                 }
3369         }
3370
3371         rt = (struct rt6_info *) dst;
3372         if (rt->rt6i_flags & RTF_REJECT) {
3373                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3374                 return;
3375         }
3376
3377         /* Redirect received -> path was valid.
3378          * Look, redirects are sent only in response to data packets,
3379          * so that this nexthop apparently is reachable. --ANK
3380          */
3381         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3382
3383         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3384         if (!neigh)
3385                 return;
3386
3387         /*
3388          *      We have finally decided to accept it.
3389          */
3390
3391         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3392                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3393                      NEIGH_UPDATE_F_OVERRIDE|
3394                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3395                                      NEIGH_UPDATE_F_ISROUTER)),
3396                      NDISC_REDIRECT, &ndopts);
3397
3398         rcu_read_lock();
3399         from = rcu_dereference(rt->from);
3400         /* This fib6_info_hold() is safe here because we hold reference to rt
3401          * and rt already holds reference to fib6_info.
3402          */
3403         fib6_info_hold(from);
3404         rcu_read_unlock();
3405
3406         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3407         if (!nrt)
3408                 goto out;
3409
3410         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3411         if (on_link)
3412                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3413
3414         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3415
3416         /* No need to remove rt from the exception table if rt is
3417          * a cached route because rt6_insert_exception() will
3418          * takes care of it
3419          */
3420         if (rt6_insert_exception(nrt, from)) {
3421                 dst_release_immediate(&nrt->dst);
3422                 goto out;
3423         }
3424
3425         netevent.old = &rt->dst;
3426         netevent.new = &nrt->dst;
3427         netevent.daddr = &msg->dest;
3428         netevent.neigh = neigh;
3429         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3430
3431 out:
3432         fib6_info_release(from);
3433         neigh_release(neigh);
3434 }
3435
3436 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up a Route Information route (RFC 4191) for @prefix/@prefixlen
 * learned via gateway @gwaddr on @dev.  Returns the matching fib6_info
 * with a reference held (via fib6_info_hold_safe()), or NULL.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev)
{
        /* Routes go to the l3mdev table when @dev is enslaved, else
         * to RT6_TABLE_INFO.
         */
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
        int ifindex = dev->ifindex;
        struct fib6_node *fn;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;

        table = fib6_get_table(net, tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
        if (!fn)
                goto out;

        /* for_each_fib6_node_rt_rcu() iterates using 'rt'; break out with
         * a reference held on the first route matching device, flags and
         * gateway.  Routes whose refcount cannot be taken are skipped.
         */
        for_each_fib6_node_rt_rcu(fn) {
                if (rt->fib6_nh.nh_dev->ifindex != ifindex)
                        continue;
                if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
                        continue;
                if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
                        continue;
                if (!fib6_info_hold_safe(rt))
                        continue;
                break;
        }
out:
        rcu_read_unlock();
        return rt;
}
3472
3473 static struct fib6_info *rt6_add_route_info(struct net *net,
3474                                            const struct in6_addr *prefix, int prefixlen,
3475                                            const struct in6_addr *gwaddr,
3476                                            struct net_device *dev,
3477                                            unsigned int pref)
3478 {
3479         struct fib6_config cfg = {
3480                 .fc_metric      = IP6_RT_PRIO_USER,
3481                 .fc_ifindex     = dev->ifindex,
3482                 .fc_dst_len     = prefixlen,
3483                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3484                                   RTF_UP | RTF_PREF(pref),
3485                 .fc_protocol = RTPROT_RA,
3486                 .fc_type = RTN_UNICAST,
3487                 .fc_nlinfo.portid = 0,
3488                 .fc_nlinfo.nlh = NULL,
3489                 .fc_nlinfo.nl_net = net,
3490         };
3491
3492         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3493         cfg.fc_dst = *prefix;
3494         cfg.fc_gateway = *gwaddr;
3495
3496         /* We should treat it as a default route if prefix length is 0. */
3497         if (!prefixlen)
3498                 cfg.fc_flags |= RTF_DEFAULT;
3499
3500         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3501
3502         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3503 }
3504 #endif
3505
/* Find the RA-learned default route (RTF_ADDRCONF|RTF_DEFAULT) via
 * gateway @addr on @dev.  Returns the fib6_info with a reference held,
 * or NULL if absent or the reference could not be taken.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
        struct fib6_info *rt;
        struct fib6_table *table;

        table = fib6_get_table(net, tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        /* for_each_fib6_node_rt_rcu() iterates using 'rt'; the NULL check
         * below handles a walk that completes without matching.
         */
        for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (dev == rt->fib6_nh.nh_dev &&
                    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
                    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
                        break;
        }
        if (rt && !fib6_info_hold_safe(rt))
                rt = NULL;
        rcu_read_unlock();
        return rt;
}
3530
3531 struct fib6_info *rt6_add_dflt_router(struct net *net,
3532                                      const struct in6_addr *gwaddr,
3533                                      struct net_device *dev,
3534                                      unsigned int pref)
3535 {
3536         struct fib6_config cfg = {
3537                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3538                 .fc_metric      = IP6_RT_PRIO_USER,
3539                 .fc_ifindex     = dev->ifindex,
3540                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3541                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3542                 .fc_protocol = RTPROT_RA,
3543                 .fc_type = RTN_UNICAST,
3544                 .fc_nlinfo.portid = 0,
3545                 .fc_nlinfo.nlh = NULL,
3546                 .fc_nlinfo.nl_net = net,
3547         };
3548
3549         cfg.fc_gateway = *gwaddr;
3550
3551         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3552                 struct fib6_table *table;
3553
3554                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3555                 if (table)
3556                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3557         }
3558
3559         return rt6_get_dflt_router(net, gwaddr, dev);
3560 }
3561
/* Delete addrconf/default routes from @table, then clear its
 * RT6_TABLE_HAS_DFLT_ROUTER flag.
 */
static void __rt6_purge_dflt_routers(struct net *net,
                                     struct fib6_table *table)
{
        struct fib6_info *rt;

restart:
        rcu_read_lock();
        for_each_fib6_node_rt_rcu(&table->tb6_root) {
                struct net_device *dev = fib6_info_nh_dev(rt);
                struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

                /* Devices with accept_ra == 2 keep their RA routes. */
                if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
                    (!idev || idev->cnf.accept_ra != 2) &&
                    fib6_info_hold_safe(rt)) {
                        /* ip6_del_rt() modifies the tree and consumes our
                         * reference; drop the RCU lock first and rescan
                         * from the top afterwards.
                         */
                        rcu_read_unlock();
                        ip6_del_rt(net, rt);
                        goto restart;
                }
        }
        rcu_read_unlock();

        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3585
3586 void rt6_purge_dflt_routers(struct net *net)
3587 {
3588         struct fib6_table *table;
3589         struct hlist_head *head;
3590         unsigned int h;
3591
3592         rcu_read_lock();
3593
3594         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3595                 head = &net->ipv6.fib_table_hash[h];
3596                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3597                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3598                                 __rt6_purge_dflt_routers(net, table);
3599                 }
3600         }
3601
3602         rcu_read_unlock();
3603 }
3604
3605 static void rtmsg_to_fib6_config(struct net *net,
3606                                  struct in6_rtmsg *rtmsg,
3607                                  struct fib6_config *cfg)
3608 {
3609         *cfg = (struct fib6_config){
3610                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3611                          : RT6_TABLE_MAIN,
3612                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3613                 .fc_metric = rtmsg->rtmsg_metric,
3614                 .fc_expires = rtmsg->rtmsg_info,
3615                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3616                 .fc_src_len = rtmsg->rtmsg_src_len,
3617                 .fc_flags = rtmsg->rtmsg_flags,
3618                 .fc_type = rtmsg->rtmsg_type,
3619
3620                 .fc_nlinfo.nl_net = net,
3621
3622                 .fc_dst = rtmsg->rtmsg_dst,
3623                 .fc_src = rtmsg->rtmsg_src,
3624                 .fc_gateway = rtmsg->rtmsg_gateway,
3625         };
3626 }
3627
/* Handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 * Requires CAP_NET_ADMIN in the netns user namespace.
 * Returns 0 on success or a negative errno (-EPERM, -EFAULT, -EINVAL, or
 * whatever ip6_route_add()/ip6_route_del() report).
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
        struct fib6_config cfg;
        struct in6_rtmsg rtmsg;
        int err;

        switch (cmd) {
        case SIOCADDRT:         /* Add a route */
        case SIOCDELRT:         /* Delete a route */
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                /* copy_from_user() returns the number of bytes NOT copied;
                 * any non-zero result means -EFAULT.
                 */
                err = copy_from_user(&rtmsg, arg,
                                     sizeof(struct in6_rtmsg));
                if (err)
                        return -EFAULT;

                rtmsg_to_fib6_config(net, &rtmsg, &cfg);

                /* Route table changes are serialized under the RTNL lock. */
                rtnl_lock();
                switch (cmd) {
                case SIOCADDRT:
                        err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
                        break;
                case SIOCDELRT:
                        err = ip6_route_del(&cfg, NULL);
                        break;
                default:
                        err = -EINVAL;
                }
                rtnl_unlock();

                return err;
        }

        return -EINVAL;
}
3664
3665 /*
3666  *      Drop the packet on the floor
3667  */
3668
/* Common drop path for unroutable packets: bump the appropriate SNMP
 * counter, send an ICMPv6 Destination Unreachable with @code, and free
 * the skb.  @ipstats_mib_noroutes selects input (INNOROUTES) vs output
 * (OUTNOROUTES) accounting.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
        int type;
        struct dst_entry *dst = skb_dst(skb);
        switch (ipstats_mib_noroutes) {
        case IPSTATS_MIB_INNOROUTES:
                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
                if (type == IPV6_ADDR_ANY) {
                        /* Destination :: counts as an address error,
                         * not a missing route.
                         */
                        IP6_INC_STATS(dev_net(dst->dev),
                                      __in6_dev_get_safely(skb->dev),
                                      IPSTATS_MIB_INADDRERRORS);
                        break;
                }
                /* FALLTHROUGH */
        case IPSTATS_MIB_OUTNOROUTES:
                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
                              ipstats_mib_noroutes);
                break;
        }
        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
        kfree_skb(skb);
        return 0;
}
3692
/* dst input handler: drop with "no route" accounting and ICMP code. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

/* dst output handler: same as above on the output path; sets skb->dev
 * from the dst before dropping.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

/* dst input handler: drop with "administratively prohibited" ICMP code. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

/* dst output handler: "administratively prohibited" on the output path. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3714
3715 /*
3716  *      Allocate a dst for local (unicast / anycast) address.
3717  */
3718
/* Allocate a fib6_info for a local unicast (@anycast == false) or anycast
 * address @addr on @idev.  The entry is a /128 host route flagged
 * RTF_UP|RTF_NONEXTHOP plus RTF_LOCAL or RTF_ANYCAST.  Takes a reference
 * on the device.  Returns the fib6_info or ERR_PTR(-ENOMEM).
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
                                     struct inet6_dev *idev,
                                     const struct in6_addr *addr,
                                     bool anycast, gfp_t gfp_flags)
{
        u32 tb_id;
        struct net_device *dev = idev->dev;
        struct fib6_info *f6i;

        f6i = fib6_info_alloc(gfp_flags);
        if (!f6i)
                return ERR_PTR(-ENOMEM);

        f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
        f6i->dst_nocount = true;
        f6i->dst_host = true;
        f6i->fib6_protocol = RTPROT_KERNEL;
        f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
        if (anycast) {
                f6i->fib6_type = RTN_ANYCAST;
                f6i->fib6_flags |= RTF_ANYCAST;
        } else {
                f6i->fib6_type = RTN_LOCAL;
                f6i->fib6_flags |= RTF_LOCAL;
        }

        /* nexthop "gateway" is the address itself for local/anycast */
        f6i->fib6_nh.nh_gw = *addr;
        dev_hold(dev);
        f6i->fib6_nh.nh_dev = dev;
        f6i->fib6_dst.addr = *addr;
        f6i->fib6_dst.plen = 128;
        tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
        f6i->fib6_table = fib6_get_table(net, tb_id);

        return f6i;
}
3755
3756 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
        struct net_device *dev; /* restrict to this device; NULL = any */
        struct net *net;        /* namespace being cleaned */
        struct in6_addr *addr;  /* prefsrc address being removed */
};
3762
/* fib6_clean_all() callback: clear the preferred-source entry of routes
 * whose prefsrc equals the removed address (optionally filtered by
 * device).  Always returns 0 so the walk continues.
 */
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
        struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
        struct net *net = ((struct arg_dev_net_ip *)arg)->net;
        struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

        if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
            rt != net->ipv6.fib6_null_entry &&
            ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
                spin_lock_bh(&rt6_exception_lock);
                /* remove prefsrc entry */
                rt->fib6_prefsrc.plen = 0;
                spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
}
3779
3780 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3781 {
3782         struct net *net = dev_net(ifp->idev->dev);
3783         struct arg_dev_net_ip adni = {
3784                 .dev = ifp->idev->dev,
3785                 .net = net,
3786                 .addr = &ifp->addr,
3787         };
3788         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3789 }
3790
/* Flag combination identifying an RA-installed default-router route. */
#define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
        struct in6_addr *gateway = (struct in6_addr *)arg;

        /* Returning -1 asks the fib6 walker to delete this route. */
        if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
            ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
                return -1;
        }

        /* Further clean up cached routes in exception table.
         * This is needed because cached route may have a different
         * gateway than its 'parent' in the case of an ip redirect.
         */
        rt6_exceptions_clean_tohost(rt, gateway);

        return 0;
}
3811
/* Drop all routes (and cached exceptions) that use @gateway as next hop,
 * for when the former router turns out to be a plain host.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3816
/* Walker argument for the netdev event handlers (fib6_ifup/fib6_ifdown).
 * The union member used depends on the caller: nh_flags for sync-up,
 * event for sync-down.
 */
struct arg_netdev_event {
        const struct net_device *dev;
        union {
                unsigned int nh_flags;
                unsigned long event;
        };
};
3824
/* Return the first route in @rt's fib6 node with the same metric that
 * qualifies for ECMP, i.e. the head of @rt's sibling group, or NULL.
 * Caller must hold the table write lock — the lockdep expressions on
 * rcu_dereference_protected() assert this.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
        struct fib6_info *iter;
        struct fib6_node *fn;

        fn = rcu_dereference_protected(rt->fib6_node,
                        lockdep_is_held(&rt->fib6_table->tb6_lock));
        iter = rcu_dereference_protected(fn->leaf,
                        lockdep_is_held(&rt->fib6_table->tb6_lock));
        while (iter) {
                if (iter->fib6_metric == rt->fib6_metric &&
                    rt6_qualify_for_ecmp(iter))
                        return iter;
                iter = rcu_dereference_protected(iter->fib6_next,
                                lockdep_is_held(&rt->fib6_table->tb6_lock));
        }

        return NULL;
}
3844
3845 static bool rt6_is_dead(const struct fib6_info *rt)
3846 {
3847         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3848             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3849              fib6_ignore_linkdown(rt)))
3850                 return true;
3851
3852         return false;
3853 }
3854
3855 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3856 {
3857         struct fib6_info *iter;
3858         int total = 0;
3859
3860         if (!rt6_is_dead(rt))
3861                 total += rt->fib6_nh.nh_weight;
3862
3863         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3864                 if (!rt6_is_dead(iter))
3865                         total += iter->fib6_nh.nh_weight;
3866         }
3867
3868         return total;
3869 }
3870
/* Assign @rt's multipath selection upper bound.  @weight accumulates the
 * cumulative weight across the sibling walk; the bound is the cumulative
 * share of @total scaled into 31-bit fixed point.  Dead nexthops get -1
 * so multipath selection never picks them.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
        int upper_bound = -1;

        if (!rt6_is_dead(rt)) {
                *weight += rt->fib6_nh.nh_weight;
                upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
                                                    total) - 1;
        }
        atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}
3882
/* Recompute the upper bounds for @rt and its siblings against @total.
 * The shared 'weight' accumulator makes the bounds monotonically
 * increasing along the sibling list.
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
        struct fib6_info *iter;
        int weight = 0;

        rt6_upper_bound_set(rt, &weight, total);

        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                rt6_upper_bound_set(iter, &weight, total);
}
3893
/* Recompute the multipath hash upper bounds for @rt's ECMP group after a
 * nexthop changed state.  No-op for non-multipath routes or groups that
 * are being flushed entirely.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
        struct fib6_info *first;
        int total;

        /* In case the entire multipath route was marked for flushing,
         * then there is no need to rebalance upon the removal of every
         * sibling route.
         */
        if (!rt->fib6_nsiblings || rt->should_flush)
                return;

        /* During lookup routes are evaluated in order, so we need to
         * make sure upper bounds are assigned from the first sibling
         * onwards.
         */
        first = rt6_multipath_first_sibling(rt);
        if (WARN_ON_ONCE(!first))
                return;

        total = rt6_multipath_total_weight(first);
        rt6_multipath_upper_bound_set(first, total);
}
3917
3918 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3919 {
3920         const struct arg_netdev_event *arg = p_arg;
3921         struct net *net = dev_net(arg->dev);
3922
3923         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3924                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3925                 fib6_update_sernum_upto_root(net, rt);
3926                 rt6_multipath_rebalance(rt);
3927         }
3928
3929         return 0;
3930 }
3931
/* Clear @nh_flags on all routes using @dev (device came back up). */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
        struct arg_netdev_event arg = {
                .dev = dev,
                {
                        .nh_flags = nh_flags,
                },
        };

        /* When undoing DEAD and the carrier is up, LINKDOWN can be
         * cleared as well in the same pass.
         */
        if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
                arg.nh_flags |= RTNH_F_LINKDOWN;

        fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
3946
3947 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3948                                    const struct net_device *dev)
3949 {
3950         struct fib6_info *iter;
3951
3952         if (rt->fib6_nh.nh_dev == dev)
3953                 return true;
3954         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3955                 if (iter->fib6_nh.nh_dev == dev)
3956                         return true;
3957
3958         return false;
3959 }
3960
/* Mark @rt and all of its ECMP siblings for flushing; fib6_ifdown()
 * deletes routes with should_flush set.
 */
static void rt6_multipath_flush(struct fib6_info *rt)
{
        struct fib6_info *iter;

        rt->should_flush = 1;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                iter->should_flush = 1;
}
3969
/* Count nexthops in @rt's ECMP group (rt itself plus siblings) that are
 * on @down_dev or already flagged RTNH_F_DEAD.
 */
static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
                                             const struct net_device *down_dev)
{
        struct fib6_info *iter;
        unsigned int dead = 0;

        if (rt->fib6_nh.nh_dev == down_dev ||
            rt->fib6_nh.nh_flags & RTNH_F_DEAD)
                dead++;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh.nh_dev == down_dev ||
                    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
                        dead++;

        return dead;
}
3986
/* Set @nh_flags on every nexthop in @rt's ECMP group whose device is
 * @dev.
 */
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
                                       const struct net_device *dev,
                                       unsigned int nh_flags)
{
        struct fib6_info *iter;

        if (rt->fib6_nh.nh_dev == dev)
                rt->fib6_nh.nh_flags |= nh_flags;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh.nh_dev == dev)
                        iter->fib6_nh.nh_flags |= nh_flags;
}
3999
4000 /* called with write lock held for table with rt */
4001 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4002 {
4003         const struct arg_netdev_event *arg = p_arg;
4004         const struct net_device *dev = arg->dev;
4005         struct net *net = dev_net(dev);
4006
4007         if (rt == net->ipv6.fib6_null_entry)
4008                 return 0;
4009
4010         switch (arg->event) {
4011         case NETDEV_UNREGISTER:
4012                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4013         case NETDEV_DOWN:
4014                 if (rt->should_flush)
4015                         return -1;
4016                 if (!rt->fib6_nsiblings)
4017                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4018                 if (rt6_multipath_uses_dev(rt, dev)) {
4019                         unsigned int count;
4020
4021                         count = rt6_multipath_dead_count(rt, dev);
4022                         if (rt->fib6_nsiblings + 1 == count) {
4023                                 rt6_multipath_flush(rt);
4024                                 return -1;
4025                         }
4026                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4027                                                    RTNH_F_LINKDOWN);
4028                         fib6_update_sernum(net, rt);
4029                         rt6_multipath_rebalance(rt);
4030                 }
4031                 return -2;
4032         case NETDEV_CHANGE:
4033                 if (rt->fib6_nh.nh_dev != dev ||
4034                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4035                         break;
4036                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4037                 rt6_multipath_rebalance(rt);
4038                 break;
4039         }
4040
4041         return 0;
4042 }
4043
/* Propagate a netdev @event (down/unregister/change) to the fib,
 * optionally suppressing netlink notifications per the
 * skip_notify_on_dev_down sysctl.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
        struct arg_netdev_event arg = {
                .dev = dev,
                {
                        .event = event,
                },
        };
        struct net *net = dev_net(dev);

        if (net->ipv6.sysctl.skip_notify_on_dev_down)
                fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
        else
                fib6_clean_all(net, fib6_ifdown, &arg);
}
4059
/* Full IPv6 teardown for @dev: sync routes for @event, flush uncached
 * dst entries on the device, and purge its neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
        rt6_sync_down_dev(dev, event);
        rt6_uncached_list_flush_dev(dev_net(dev), dev);
        neigh_ifdown(&nd_tbl, dev);
}
4066
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* new MTU */
};
4071
/* fib6_clean_all() callback: propagate a device MTU change into the
 * route's RTAX_MTU metric and its cached exception routes.  Always
 * returns 0 (keep walking).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
        struct inet6_dev *idev;

        /* In IPv6 pmtu discovery is not optional,
           so that RTAX_MTU lock cannot disable it.
           We still use this lock to block changes
           caused by addrconf/ndisc.
        */

        idev = __in6_dev_get(arg->dev);
        if (!idev)
                return 0;

        /* For administrative MTU increase, there is no way to discover
           IPv6 PMTU increase, so PMTU increase should be updated here.
           Since RFC 1981 doesn't include administrative MTU increase
           update PMTU increase is a MUST. (i.e. jumbo frame)
         */
        if (rt->fib6_nh.nh_dev == arg->dev &&
            !fib6_metric_locked(rt, RTAX_MTU)) {
                u32 mtu = rt->fib6_pmtu;

                /* Lower only if the route was tracking the old device
                 * MTU; always raise on MTU increase.
                 */
                if (mtu >= arg->mtu ||
                    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
                        fib6_metric_set(rt, RTAX_MTU, arg->mtu);

                spin_lock_bh(&rt6_exception_lock);
                rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
                spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
}
4106
4107 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4108 {
4109         struct rt6_mtu_change_arg arg = {
4110                 .dev = dev,
4111                 .mtu = mtu,
4112         };
4113
4114         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4115 }
4116
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]             = { .type = NLA_NESTED },
        [RTA_EXPIRES]           = { .type = NLA_U32 },
        [RTA_UID]               = { .type = NLA_U32 },
        [RTA_MARK]              = { .type = NLA_U32 },
        [RTA_TABLE]             = { .type = NLA_U32 },
        [RTA_IP_PROTO]          = { .type = NLA_U8 },
        [RTA_SPORT]             = { .type = NLA_U16 },
        [RTA_DPORT]             = { .type = NLA_U16 },
};
4136
/* Parse an RTM_{NEW,DEL}ROUTE netlink message into a fib6_config.
 * Validates attributes against rtm_ipv6_policy, maps rtm_type to
 * RTF_* flags, and copies all supplied attributes.  Returns 0 on
 * success or a negative errno (with extack filled by the parsers).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg,
                              struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          extack);
        if (err < 0)
                goto errout;

        /* default error for the length checks below */
        err = -EINVAL;
        rtm = nlmsg_data(nlh);

        *cfg = (struct fib6_config){
                .fc_table = rtm->rtm_table,
                .fc_dst_len = rtm->rtm_dst_len,
                .fc_src_len = rtm->rtm_src_len,
                .fc_flags = RTF_UP,
                .fc_protocol = rtm->rtm_protocol,
                .fc_type = rtm->rtm_type,

                .fc_nlinfo.portid = NETLINK_CB(skb).portid,
                .fc_nlinfo.nlh = nlh,
                .fc_nlinfo.nl_net = sock_net(skb->sk),
        };

        /* reject-style route types all become RTF_REJECT routes */
        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }

        if (tb[RTA_DST]) {
                /* attribute must carry at least the prefix-length's
                 * worth of bytes
                 */
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                int plen = (rtm->rtm_src_len + 7) >> 3;

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        if (tb[RTA_METRICS]) {
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        /* RTA_TABLE overrides rtm_table when present */
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        if (tb[RTA_MULTIPATH]) {
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_PREF]) {
                /* out-of-range preference degrades to MEDIUM */
                pref = nla_get_u8(tb[RTA_PREF]);
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                /* infinite timeouts mean "no expiry" and set no flag */
                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}
4263
/* One prospective nexthop of a multipath request: the fib6_info built
 * for it, the per-nexthop config used to create (and, on error, delete)
 * it, and the link into the temporary list assembled by
 * ip6_route_multipath_add().
 */
struct rt6_nh {
        struct fib6_info *fib6_info;    /* route entry for this nexthop */
        struct fib6_config r_cfg;       /* config used to create/delete it */
        struct list_head next;          /* entry in the local rt6_nh_list */
};
4269
4270 static int ip6_route_info_append(struct net *net,
4271                                  struct list_head *rt6_nh_list,
4272                                  struct fib6_info *rt,
4273                                  struct fib6_config *r_cfg)
4274 {
4275         struct rt6_nh *nh;
4276         int err = -EEXIST;
4277
4278         list_for_each_entry(nh, rt6_nh_list, next) {
4279                 /* check if fib6_info already exists */
4280                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4281                         return err;
4282         }
4283
4284         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4285         if (!nh)
4286                 return -ENOMEM;
4287         nh->fib6_info = rt;
4288         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4289         list_add_tail(&nh->next, rt6_nh_list);
4290
4291         return 0;
4292 }
4293
4294 static void ip6_route_mpath_notify(struct fib6_info *rt,
4295                                    struct fib6_info *rt_last,
4296                                    struct nl_info *info,
4297                                    __u16 nlflags)
4298 {
4299         /* if this is an APPEND route, then rt points to the first route
4300          * inserted and rt_last points to last route inserted. Userspace
4301          * wants a consistent dump of the route which starts at the first
4302          * nexthop. Since sibling routes are always added at the end of
4303          * the list, find the first sibling of the last route appended
4304          */
4305         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4306                 rt = list_first_entry(&rt_last->fib6_siblings,
4307                                       struct fib6_info,
4308                                       fib6_siblings);
4309         }
4310
4311         if (rt)
4312                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4313 }
4314
4315 static int ip6_route_multipath_add(struct fib6_config *cfg,
4316                                    struct netlink_ext_ack *extack)
4317 {
4318         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4319         struct nl_info *info = &cfg->fc_nlinfo;
4320         struct fib6_config r_cfg;
4321         struct rtnexthop *rtnh;
4322         struct fib6_info *rt;
4323         struct rt6_nh *err_nh;
4324         struct rt6_nh *nh, *nh_safe;
4325         __u16 nlflags;
4326         int remaining;
4327         int attrlen;
4328         int err = 1;
4329         int nhn = 0;
4330         int replace = (cfg->fc_nlinfo.nlh &&
4331                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4332         LIST_HEAD(rt6_nh_list);
4333
4334         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4335         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4336                 nlflags |= NLM_F_APPEND;
4337
4338         remaining = cfg->fc_mp_len;
4339         rtnh = (struct rtnexthop *)cfg->fc_mp;
4340
4341         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4342          * fib6_info structs per nexthop
4343          */
4344         while (rtnh_ok(rtnh, remaining)) {
4345                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4346                 if (rtnh->rtnh_ifindex)
4347                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4348
4349                 attrlen = rtnh_attrlen(rtnh);
4350                 if (attrlen > 0) {
4351                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4352
4353                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4354                         if (nla) {
4355                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4356                                 r_cfg.fc_flags |= RTF_GATEWAY;
4357                         }
4358                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4359                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4360                         if (nla)
4361                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4362                 }
4363
4364                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4365                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4366                 if (IS_ERR(rt)) {
4367                         err = PTR_ERR(rt);
4368                         rt = NULL;
4369                         goto cleanup;
4370                 }
4371                 if (!rt6_qualify_for_ecmp(rt)) {
4372                         err = -EINVAL;
4373                         NL_SET_ERR_MSG(extack,
4374                                        "Device only routes can not be added for IPv6 using the multipath API.");
4375                         fib6_info_release(rt);
4376                         goto cleanup;
4377                 }
4378
4379                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4380
4381                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4382                                             rt, &r_cfg);
4383                 if (err) {
4384                         fib6_info_release(rt);
4385                         goto cleanup;
4386                 }
4387
4388                 rtnh = rtnh_next(rtnh, &remaining);
4389         }
4390
4391         /* for add and replace send one notification with all nexthops.
4392          * Skip the notification in fib6_add_rt2node and send one with
4393          * the full route when done
4394          */
4395         info->skip_notify = 1;
4396
4397         err_nh = NULL;
4398         list_for_each_entry(nh, &rt6_nh_list, next) {
4399                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4400                 fib6_info_release(nh->fib6_info);
4401
4402                 if (!err) {
4403                         /* save reference to last route successfully inserted */
4404                         rt_last = nh->fib6_info;
4405
4406                         /* save reference to first route for notification */
4407                         if (!rt_notif)
4408                                 rt_notif = nh->fib6_info;
4409                 }
4410
4411                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4412                 nh->fib6_info = NULL;
4413                 if (err) {
4414                         if (replace && nhn)
4415                                 NL_SET_ERR_MSG_MOD(extack,
4416                                                    "multipath route replace failed (check consistency of installed routes)");
4417                         err_nh = nh;
4418                         goto add_errout;
4419                 }
4420
4421                 /* Because each route is added like a single route we remove
4422                  * these flags after the first nexthop: if there is a collision,
4423                  * we have already failed to add the first nexthop:
4424                  * fib6_add_rt2node() has rejected it; when replacing, old
4425                  * nexthops have been replaced by first new, the rest should
4426                  * be added to it.
4427                  */
4428                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4429                                                      NLM_F_REPLACE);
4430                 nhn++;
4431         }
4432
4433         /* success ... tell user about new route */
4434         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4435         goto cleanup;
4436
4437 add_errout:
4438         /* send notification for routes that were added so that
4439          * the delete notifications sent by ip6_route_del are
4440          * coherent
4441          */
4442         if (rt_notif)
4443                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4444
4445         /* Delete routes that were already added */
4446         list_for_each_entry(nh, &rt6_nh_list, next) {
4447                 if (err_nh == nh)
4448                         break;
4449                 ip6_route_del(&nh->r_cfg, extack);
4450         }
4451
4452 cleanup:
4453         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4454                 if (nh->fib6_info)
4455                         fib6_info_release(nh->fib6_info);
4456                 list_del(&nh->next);
4457                 kfree(nh);
4458         }
4459
4460         return err;
4461 }
4462
4463 static int ip6_route_multipath_del(struct fib6_config *cfg,
4464                                    struct netlink_ext_ack *extack)
4465 {
4466         struct fib6_config r_cfg;
4467         struct rtnexthop *rtnh;
4468         int remaining;
4469         int attrlen;
4470         int err = 1, last_err = 0;
4471
4472         remaining = cfg->fc_mp_len;
4473         rtnh = (struct rtnexthop *)cfg->fc_mp;
4474
4475         /* Parse a Multipath Entry */
4476         while (rtnh_ok(rtnh, remaining)) {
4477                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4478                 if (rtnh->rtnh_ifindex)
4479                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4480
4481                 attrlen = rtnh_attrlen(rtnh);
4482                 if (attrlen > 0) {
4483                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4484
4485                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4486                         if (nla) {
4487                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4488                                 r_cfg.fc_flags |= RTF_GATEWAY;
4489                         }
4490                 }
4491                 err = ip6_route_del(&r_cfg, extack);
4492                 if (err)
4493                         last_err = err;
4494
4495                 rtnh = rtnh_next(rtnh, &remaining);
4496         }
4497
4498         return last_err;
4499 }
4500
4501 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4502                               struct netlink_ext_ack *extack)
4503 {
4504         struct fib6_config cfg;
4505         int err;
4506
4507         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4508         if (err < 0)
4509                 return err;
4510
4511         if (cfg.fc_mp)
4512                 return ip6_route_multipath_del(&cfg, extack);
4513         else {
4514                 cfg.fc_delete_all_nh = 1;
4515                 return ip6_route_del(&cfg, extack);
4516         }
4517 }
4518
4519 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4520                               struct netlink_ext_ack *extack)
4521 {
4522         struct fib6_config cfg;
4523         int err;
4524
4525         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4526         if (err < 0)
4527                 return err;
4528
4529         if (cfg.fc_mp)
4530                 return ip6_route_multipath_add(&cfg, extack);
4531         else
4532                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4533 }
4534
/* Worst-case netlink message size for a route notification about @rt;
 * used to size the skb in inet6_rt_notify().  Must not under-estimate:
 * rt6_fill_node() returning -EMSGSIZE on a buffer of this size is
 * treated as a bug (see the WARN_ON in inet6_rt_notify()).
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
        int nexthop_len = 0;

        if (rt->fib6_nsiblings) {
                /* per-sibling cost inside RTA_MULTIPATH: rtnexthop header
                 * plus nested gateway and lwtunnel encap attributes
                 */
                nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
                            + NLA_ALIGN(sizeof(struct rtnexthop))
                            + nla_total_size(16) /* RTA_GATEWAY */
                            + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

                nexthop_len *= rt->fib6_nsiblings;
        }

        /* base message: header plus every attribute a single-nexthop
         * route can carry (this also covers the lead nexthop of a
         * multipath route)
         */
        return NLMSG_ALIGN(sizeof(struct rtmsg))
               + nla_total_size(16) /* RTA_SRC */
               + nla_total_size(16) /* RTA_DST */
               + nla_total_size(16) /* RTA_GATEWAY */
               + nla_total_size(16) /* RTA_PREFSRC */
               + nla_total_size(4) /* RTA_TABLE */
               + nla_total_size(4) /* RTA_IIF */
               + nla_total_size(4) /* RTA_OIF */
               + nla_total_size(4) /* RTA_PRIORITY */
               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
               + nla_total_size(sizeof(struct rta_cacheinfo))
               + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
               + nla_total_size(1) /* RTA_PREF */
               + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
               + nexthop_len;
}
4564
/* Emit the nexthop attributes of @rt (gateway, oif, lwtunnel encap)
 * into @skb and accumulate its RTNH_F_* state bits into *@flags.
 * @skip_oif: omit RTA_OIF — used for multipath encoding, where the
 *            ifindex is carried in the rtnexthop struct instead.
 * Returns 0 on success or -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
                            unsigned int *flags, bool skip_oif)
{
        if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
                *flags |= RTNH_F_DEAD;

        if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
                *flags |= RTNH_F_LINKDOWN;

                /* also report the nexthop as dead when the table is set
                 * to ignore routes on linkdown devices
                 */
                rcu_read_lock();
                if (fib6_ignore_linkdown(rt))
                        *flags |= RTNH_F_DEAD;
                rcu_read_unlock();
        }

        if (rt->fib6_flags & RTF_GATEWAY) {
                if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
                        goto nla_put_failure;
        }

        *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
        if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
                *flags |= RTNH_F_OFFLOAD;

        /* not needed for multipath encoding b/c it has a rtnexthop struct */
        if (!skip_oif && rt->fib6_nh.nh_dev &&
            nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
                goto nla_put_failure;

        if (rt->fib6_nh.nh_lwtstate &&
            lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
                goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}
4603
/* add multipath next hop: emit one rtnexthop entry (plus its nested
 * attributes) for @rt inside an open RTA_MULTIPATH nest.
 * Returns 0 on success or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
        const struct net_device *dev = rt->fib6_nh.nh_dev;
        struct rtnexthop *rtnh;
        unsigned int flags = 0;

        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
        if (!rtnh)
                goto nla_put_failure;

        /* nh_weight is 1-based; rtnh_hops is 0-based on the wire */
        rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
        rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

        /* skip_oif=true: the ifindex above replaces RTA_OIF */
        if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
                goto nla_put_failure;

        rtnh->rtnh_flags = flags;

        /* length of rtnetlink header + attributes */
        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}
4631
/* Build one route message of @type for @rt (the FIB entry) and,
 * optionally, @dst (a cached rt6_info the lookup resolved to; NULL for
 * plain FIB dumps and notifications).
 * @dest/@src: concrete addresses for an RTM_GETROUTE reply; when set,
 *             the respective prefix length is reported as 128.
 * @iif: input ifindex for replies to requests that carried RTA_IIF.
 * Returns 0 on success or -EMSGSIZE; on failure the partly built
 * message is cancelled.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        struct rt6_info *rt6 = (struct rt6_info *)dst;
        struct rt6key *rt6_dst, *rt6_src;
        u32 *pmetrics, table, rt6_flags;
        struct nlmsghdr *nlh;
        struct rtmsg *rtm;
        long expires = 0;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        /* report the cached clone's keys/flags when one was supplied,
         * otherwise the FIB entry's
         */
        if (rt6) {
                rt6_dst = &rt6->rt6i_dst;
                rt6_src = &rt6->rt6i_src;
                rt6_flags = rt6->rt6i_flags;
        } else {
                rt6_dst = &rt->fib6_dst;
                rt6_src = &rt->fib6_src;
                rt6_flags = rt->fib6_flags;
        }

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt6_dst->plen;
        rtm->rtm_src_len = rt6_src->plen;
        rtm->rtm_tos = 0;
        if (rt->fib6_table)
                table = rt->fib6_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* rtm_table is only 8 bits; large ids go in RTA_TABLE below */
        rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;

        rtm->rtm_type = rt->fib6_type;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->fib6_protocol;

        if (rt6_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        if (dest) {
                if (nla_put_in6_addr(skb, RTA_DST, dest))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                /* multicast destinations are resolved via the mroute
                 * tables; ip6mr_get_route() fills the message itself
                 */
                if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        if (err == 0)
                                return 0;
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dest) {
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        if (rt->fib6_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->fib6_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
        if (rtnetlink_put_metrics(skb, pmetrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
                goto nla_put_failure;

        /* For multipath routes, walk the siblings list and add
         * each as a nexthop within RTA_MULTIPATH.
         */
        if (rt6) {
                if (rt6_flags & RTF_GATEWAY &&
                    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
                        goto nla_put_failure;

                if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
                        goto nla_put_failure;
        } else if (rt->fib6_nsiblings) {
                struct fib6_info *sibling, *next_sibling;
                struct nlattr *mp;

                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                if (rt6_add_nexthop(skb, rt) < 0)
                        goto nla_put_failure;

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings, fib6_siblings) {
                        if (rt6_add_nexthop(skb, sibling) < 0)
                                goto nla_put_failure;
                }

                nla_nest_end(skb, mp);
        } else {
                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
                        goto nla_put_failure;
        }

        if (rt6_flags & RTF_EXPIRES) {
                expires = dst ? dst->expires : rt->expires;
                expires -= jiffies;
        }

        if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
4782
4783 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4784                                const struct net_device *dev)
4785 {
4786         if (f6i->fib6_nh.nh_dev == dev)
4787                 return true;
4788
4789         if (f6i->fib6_nsiblings) {
4790                 struct fib6_info *sibling, *next_sibling;
4791
4792                 list_for_each_entry_safe(sibling, next_sibling,
4793                                          &f6i->fib6_siblings, fib6_siblings) {
4794                         if (sibling->fib6_nh.nh_dev == dev)
4795                                 return true;
4796                 }
4797         }
4798
4799         return false;
4800 }
4801
/* Per-route callback for route dumps, invoked by the fib6 tree walker.
 * Applies the dump filter and emits one RTM_NEWROUTE record for @rt.
 * A negative return (-EMSGSIZE from rt6_fill_node()) tells the walker
 * the dump skb is full so it can suspend and resume later; non-negative
 * values let the walk continue (1 is used for "filtered out").
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
        struct fib_dump_filter *filter = &arg->filter;
        unsigned int flags = NLM_F_MULTI;
        struct net *net = arg->net;

        /* never report the per-netns null entry */
        if (rt == net->ipv6.fib6_null_entry)
                return 0;

        if ((filter->flags & RTM_F_PREFIX) &&
            !(rt->fib6_flags & RTF_PREFIX_RT)) {
                /* success since this is not a prefix route */
                return 1;
        }
        if (filter->filter_set) {
                /* all set criteria must match; otherwise skip the route */
                if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
                    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
                    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
                        return 1;
                }
                flags |= NLM_F_DUMP_FILTERED;
        }

        return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
                             RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
                             arg->cb->nlh->nlmsg_seq, flags);
}
4830
/* RTM_GETROUTE handler: resolve the flow described by the request's
 * attributes — as input traffic when RTA_IIF is given, otherwise as
 * output — and unicast the resulting route back to the requester.
 * With RTM_F_FIB_MATCH set, the matching FIB entry is reported instead
 * of the resolved destination.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        int err, iif = 0, oif = 0;
        struct fib6_info *from;
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi6 fl6 = {};
        bool fibmatch;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          extack);
        if (err < 0)
                goto errout;

        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

        /* build the flow from the optional request attributes */
        if (tb[RTA_SRC]) {
                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
        }

        if (tb[RTA_DST]) {
                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
        }

        if (tb[RTA_IIF])
                iif = nla_get_u32(tb[RTA_IIF]);

        if (tb[RTA_OIF])
                oif = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_MARK])
                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

        if (tb[RTA_UID])
                fl6.flowi6_uid = make_kuid(current_user_ns(),
                                           nla_get_u32(tb[RTA_UID]));
        else
                /* no RTA_UID: input lookups use INVALID_UID, output
                 * lookups the requesting task's uid
                 */
                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

        if (tb[RTA_SPORT])
                fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &fl6.flowi6_proto, extack);
                if (err)
                        goto errout;
        }

        if (iif) {
                /* input-path lookup on the given interface */
                struct net_device *dev;
                int flags = 0;

                rcu_read_lock();

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }

                fl6.flowi6_iif = iif;

                if (!ipv6_addr_any(&fl6.saddr))
                        flags |= RT6_LOOKUP_F_HAS_SADDR;

                dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

                rcu_read_unlock();
        } else {
                /* output-path lookup */
                fl6.flowi6_oif = oif;

                dst = ip6_route_output(net, NULL, &fl6);
        }


        rt = container_of(dst, struct rt6_info, dst);
        if (rt->dst.error) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        /* the null entry signals "no route" even with dst.error clear */
        if (rt == net->ipv6.ip6_null_entry) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }

        /* skb now owns the dst reference; released with the skb */
        skb_dst_set(skb, &rt->dst);

        /* rt->from may be cleared concurrently; hold RCU while filling */
        rcu_read_lock();
        from = rcu_dereference(rt->from);

        if (fibmatch)
                err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
                                    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, 0);
        else
                err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
                                    &fl6.saddr, iif, RTM_NEWROUTE,
                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                                    0);
        rcu_read_unlock();

        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;
}
4970
/* Broadcast a route change (@event: RTM_NEWROUTE/RTM_DELROUTE) for @rt
 * to the RTNLGRP_IPV6_ROUTE multicast group.  On failure the group's
 * listeners are flagged with the error via rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
                     unsigned int nlm_flags)
{
        struct sk_buff *skb;
        struct net *net = info->nl_net;
        u32 seq;
        int err;

        err = -ENOBUFS;
        seq = info->nlh ? info->nlh->nlmsg_seq : 0;

        /* rt6_nlmsg_size() is a worst-case bound for this message */
        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
        if (!skb)
                goto errout;

        err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
                            event, info->portid, seq, nlm_flags);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                    info->nlh, gfp_any());
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
5001
/* netdevice notifier: anchor the per-netns special routes (null entry,
 * and with multiple tables the prohibit/blackhole entries) on the
 * loopback device when it registers, and drop those idev references
 * when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        /* only the loopback device backs the special routes */
        if (!(dev->flags & IFF_LOOPBACK))
                return NOTIFY_OK;

        if (event == NETDEV_REGISTER) {
                net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
                net->ipv6.ip6_null_entry->dst.dev = dev;
                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
         } else if (event == NETDEV_UNREGISTER &&
                    dev->reg_state != NETREG_UNREGISTERED) {
                /* NETDEV_UNREGISTER could be fired for multiple times by
                 * netdev_wait_allrefs(). Make sure we only call this once.
                 */
                in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
                in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
        }

        return NOTIFY_OK;
}
5035
5036 /*
5037  *      /proc
5038  */
5039
5040 #ifdef CONFIG_PROC_FS
5041 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5042 {
5043         struct net *net = (struct net *)seq->private;
5044         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5045                    net->ipv6.rt6_stats->fib_nodes,
5046                    net->ipv6.rt6_stats->fib_route_nodes,
5047                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5048                    net->ipv6.rt6_stats->fib_rt_entries,
5049                    net->ipv6.rt6_stats->fib_rt_cache,
5050                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5051                    net->ipv6.rt6_stats->fib_discarded_routes);
5052
5053         return 0;
5054 }
5055 #endif  /* CONFIG_PROC_FS */
5056
5057 #ifdef CONFIG_SYSCTL
5058
5059 static
5060 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5061                               void __user *buffer, size_t *lenp, loff_t *ppos)
5062 {
5063         struct net *net;
5064         int delay;
5065         int ret;
5066         if (!write)
5067                 return -EINVAL;
5068
5069         net = (struct net *)ctl->extra1;
5070         delay = net->ipv6.sysctl.flush_delay;
5071         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5072         if (ret)
5073                 return ret;
5074
5075         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5076         return 0;
5077 }
5078
/* Range bounds (0..1) referenced by extra1/extra2 of the
 * "skip_notify_on_dev_down" entry in ipv6_route_table_template.
 */
static int zero;
static int one = 1;
5081
5082 static struct ctl_table ipv6_route_table_template[] = {
5083         {
5084                 .procname       =       "flush",
5085                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5086                 .maxlen         =       sizeof(int),
5087                 .mode           =       0200,
5088                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5089         },
5090         {
5091                 .procname       =       "gc_thresh",
5092                 .data           =       &ip6_dst_ops_template.gc_thresh,
5093                 .maxlen         =       sizeof(int),
5094                 .mode           =       0644,
5095                 .proc_handler   =       proc_dointvec,
5096         },
5097         {
5098                 .procname       =       "max_size",
5099                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5100                 .maxlen         =       sizeof(int),
5101                 .mode           =       0644,
5102                 .proc_handler   =       proc_dointvec,
5103         },
5104         {
5105                 .procname       =       "gc_min_interval",
5106                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5107                 .maxlen         =       sizeof(int),
5108                 .mode           =       0644,
5109                 .proc_handler   =       proc_dointvec_jiffies,
5110         },
5111         {
5112                 .procname       =       "gc_timeout",
5113                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5114                 .maxlen         =       sizeof(int),
5115                 .mode           =       0644,
5116                 .proc_handler   =       proc_dointvec_jiffies,
5117         },
5118         {
5119                 .procname       =       "gc_interval",
5120                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5121                 .maxlen         =       sizeof(int),
5122                 .mode           =       0644,
5123                 .proc_handler   =       proc_dointvec_jiffies,
5124         },
5125         {
5126                 .procname       =       "gc_elasticity",
5127                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5128                 .maxlen         =       sizeof(int),
5129                 .mode           =       0644,
5130                 .proc_handler   =       proc_dointvec,
5131         },
5132         {
5133                 .procname       =       "mtu_expires",
5134                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5135                 .maxlen         =       sizeof(int),
5136                 .mode           =       0644,
5137                 .proc_handler   =       proc_dointvec_jiffies,
5138         },
5139         {
5140                 .procname       =       "min_adv_mss",
5141                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5142                 .maxlen         =       sizeof(int),
5143                 .mode           =       0644,
5144                 .proc_handler   =       proc_dointvec,
5145         },
5146         {
5147                 .procname       =       "gc_min_interval_ms",
5148                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5149                 .maxlen         =       sizeof(int),
5150                 .mode           =       0644,
5151                 .proc_handler   =       proc_dointvec_ms_jiffies,
5152         },
5153         {
5154                 .procname       =       "skip_notify_on_dev_down",
5155                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5156                 .maxlen         =       sizeof(int),
5157                 .mode           =       0644,
5158                 .proc_handler   =       proc_dointvec,
5159                 .extra1         =       &zero,
5160                 .extra2         =       &one,
5161         },
5162         { }
5163 };
5164
/* Duplicate ipv6_route_table_template for namespace @net and repoint each
 * entry's .data at the namespace's own storage.  The indices below must
 * stay in sync with the entry order of the template.  Returns the new
 * table (owned by the caller) or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* flush handler derives the netns from extra1 */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users.  A NULL
		 * procname in slot 0 terminates the registration scan, so
		 * this hides the entire table, not just "flush".
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5194 #endif
5195
/* Per-namespace route layer setup: clone the dst_ops template, allocate
 * the special (null/prohibit/blackhole) template routes and install the
 * default sysctl values.  Returns 0 or -ENOMEM; on failure everything
 * already allocated is unwound via the goto chain at the bottom.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	/* Each namespace gets its own copy of the dst_ops template; the
	 * kmem cache pointer inside it is shared across namespaces.
	 */
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Device/idev pointers of these entries are filled in later, when
	 * the loopback device registers (ip6_route_dev_notify(), or
	 * ip6_route_init_special_entries() for init_net).
	 */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Defaults for /proc/sys/net/ipv6/route/ in this namespace. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding: each label frees what was allocated before the
	 * failing step, in reverse order of allocation.
	 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5271
5272 static void __net_exit ip6_route_net_exit(struct net *net)
5273 {
5274         kfree(net->ipv6.fib6_null_entry);
5275         kfree(net->ipv6.ip6_null_entry);
5276 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5277         kfree(net->ipv6.ip6_prohibit_entry);
5278         kfree(net->ipv6.ip6_blk_hole_entry);
5279 #endif
5280         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5281 }
5282
5283 static int __net_init ip6_route_net_init_late(struct net *net)
5284 {
5285 #ifdef CONFIG_PROC_FS
5286         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5287                         sizeof(struct ipv6_route_iter));
5288         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5289                         rt6_stats_seq_show, NULL);
5290 #endif
5291         return 0;
5292 }
5293
5294 static void __net_exit ip6_route_net_exit_late(struct net *net)
5295 {
5296 #ifdef CONFIG_PROC_FS
5297         remove_proc_entry("ipv6_route", net->proc_net);
5298         remove_proc_entry("rt6_stats", net->proc_net);
5299 #endif
5300 }
5301
/* Core per-namespace route setup/teardown (registered in ip6_route_init()). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5306
5307 static int __net_init ipv6_inetpeer_init(struct net *net)
5308 {
5309         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5310
5311         if (!bp)
5312                 return -ENOMEM;
5313         inet_peer_base_init(bp);
5314         net->ipv6.peers = bp;
5315         return 0;
5316 }
5317
5318 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5319 {
5320         struct inet_peer_base *bp = net->ipv6.peers;
5321
5322         net->ipv6.peers = NULL;
5323         inetpeer_invalidate_tree(bp);
5324         kfree(bp);
5325 }
5326
/* Per-namespace inetpeer base lifetime (registered in ip6_route_init()). */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5331
/* Late per-namespace setup (proc entries); registered after fib6 rules. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5336
/* Netdevice notifier wiring the special routes to the loopback device.
 * Priority is ADDRCONF_NOTIFY_PRIORITY - 10 so it runs relative to the
 * addrconf notifier in a fixed order (higher priority runs first).
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5341
5342 void __init ip6_route_init_special_entries(void)
5343 {
5344         /* Registering of the loopback is done before this portion of code,
5345          * the loopback reference in rt6_info will not be taken, do it
5346          * manually for init_net */
5347         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5348         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5349         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5350   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5351         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5352         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5353         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5354         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5355   #endif
5356 }
5357
/* Module-wide IPv6 routing initialization, called once at boot.
 * Registration order matters; on any failure the goto chain at the bottom
 * unwinds every step already completed, in reverse order.  Returns 0 or a
 * negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	/* Shared slab cache for struct rt6_info, used by all namespaces. */
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts come from the same cache as regular ones. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for route add/del/get.  A failure of any of
	 * them unregisters everything registered so far via
	 * rtnl_unregister_all() in the error path.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-CPU lists of uncached (DST_NOCACHE) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5449
/* Module teardown: undo every registration performed by ip6_route_init(),
 * in reverse order (rtnetlink handlers are unregistered elsewhere).
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
This page took 0.337741 seconds and 4 git commands to generate.