/*
 * net/ipv6/route.c (linux.git) — snapshot including the change
 * "ipv6: fix neighbour resolution with raw socket".
 */
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      Linux INET6 implementation
4  *      FIB front-end.
5  *
6  *      Authors:
7  *      Pedro Roque             <[email protected]>
8  */
9
10 /*      Changes:
11  *
12  *      YOSHIFUJI Hideaki @USAGI
13  *              reworked default router selection.
14  *              - respect outgoing interface
15  *              - select from (probably) reachable routers (i.e.
16  *              routers in REACHABLE, STALE, DELAY or PROBE states).
17  *              - always select the same router if it is (probably)
18  *              reachable.  otherwise, round-robin the list.
19  *      Ville Nuorvala
20  *              Fixed routing subtrees.
21  */
22
23 #define pr_fmt(fmt) "IPv6: " fmt
24
25 #include <linux/capability.h>
26 #include <linux/errno.h>
27 #include <linux/export.h>
28 #include <linux/types.h>
29 #include <linux/times.h>
30 #include <linux/socket.h>
31 #include <linux/sockios.h>
32 #include <linux/net.h>
33 #include <linux/route.h>
34 #include <linux/netdevice.h>
35 #include <linux/in6.h>
36 #include <linux/mroute6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39 #include <linux/proc_fs.h>
40 #include <linux/seq_file.h>
41 #include <linux/nsproxy.h>
42 #include <linux/slab.h>
43 #include <linux/jhash.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/dst_metadata.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58 #include <net/rtnh.h>
59 #include <net/lwtunnel.h>
60 #include <net/ip_tunnels.h>
61 #include <net/l3mdev.h>
62 #include <net/ip.h>
63 #include <linux/uaccess.h>
64
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68
69 static int ip6_rt_type_to_error(u8 fib6_type);
70
71 #define CREATE_TRACE_POINTS
72 #include <trace/events/fib6.h>
73 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
74 #undef CREATE_TRACE_POINTS
75
/* Outcome of neighbour reachability evaluation during router selection.
 * Negative values are failures of increasing recoverability; see
 * rt6_check_neigh()/rt6_score_route() for how each is produced.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable (e.g. wrong interface) */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED; probe it */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin routers */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
82
83 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
84 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
85 static unsigned int      ip6_mtu(const struct dst_entry *dst);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void             ip6_dst_destroy(struct dst_entry *);
88 static void             ip6_dst_ifdown(struct dst_entry *,
89                                        struct net_device *dev, int how);
90 static int               ip6_dst_gc(struct dst_ops *ops);
91
92 static int              ip6_pkt_discard(struct sk_buff *skb);
93 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static int              ip6_pkt_prohibit(struct sk_buff *skb);
95 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
96 static void             ip6_link_failure(struct sk_buff *skb);
97 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
98                                            struct sk_buff *skb, u32 mtu);
99 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100                                         struct sk_buff *skb);
101 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
102                            int strict);
103 static size_t rt6_nlmsg_size(struct fib6_info *rt);
104 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
105                          struct fib6_info *rt, struct dst_entry *dst,
106                          struct in6_addr *dest, struct in6_addr *src,
107                          int iif, int type, u32 portid, u32 seq,
108                          unsigned int flags);
109 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
110                                            const struct in6_addr *daddr,
111                                            const struct in6_addr *saddr);
112
113 #ifdef CONFIG_IPV6_ROUTE_INFO
114 static struct fib6_info *rt6_add_route_info(struct net *net,
115                                            const struct in6_addr *prefix, int prefixlen,
116                                            const struct in6_addr *gwaddr,
117                                            struct net_device *dev,
118                                            unsigned int pref);
119 static struct fib6_info *rt6_get_route_info(struct net *net,
120                                            const struct in6_addr *prefix, int prefixlen,
121                                            const struct in6_addr *gwaddr,
122                                            struct net_device *dev);
123 #endif
124
/* Per-CPU list of rt6_info dsts that are not attached to the FIB tree
 * (cached/uncached routes).  The lock protects @head; entries are linked
 * through rt6_info::rt6i_uncached.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};
129
130 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
131
/* Link @rt onto this CPU's uncached list and remember which list it went
 * on (rt6i_uncached_list), so rt6_uncached_list_del() can find the right
 * per-CPU lock even if it runs on another CPU.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
142
/* Unlink @rt from the uncached list it was added to, if any, and drop the
 * per-netns uncache counter.  Safe to call on routes that were never
 * added: list_empty() then skips the whole body.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
155
/* Device @dev is going away: walk every CPU's uncached list and retarget
 * any route still referencing it to the netns loopback device, moving both
 * the inet6_dev and net_device references.  No-op when @dev is loopback
 * itself (nothing to retarget to).
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the idev reference to loopback's idev */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* take a ref on loopback before dropping the old dev */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
187
188 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
189                                              struct sk_buff *skb,
190                                              const void *daddr)
191 {
192         if (!ipv6_addr_any(p))
193                 return (const void *) p;
194         else if (skb)
195                 return &ipv6_hdr(skb)->daddr;
196         return daddr;
197 }
198
/* Look up (or create) the neighbour entry for the next hop on @dev.
 * @gw may be the any-address, in which case the destination is taken from
 * @skb or @daddr (see choose_neigh_daddr()).  Returns NULL instead of an
 * ERR_PTR when neigh_create() fails, so callers need only NULL-check.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
214
/* dst_ops::neigh_lookup hook.  Passes in6addr_any as the rt6_nexthop()
 * fallback so that, for routes without a gateway, ip6_neigh_lookup()
 * falls through to the skb/daddr destination rather than a stale cached
 * address — NOTE(review): this appears to be the raw-socket neighbour
 * resolution fix referenced in the commit title; confirm against git log.
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst->dev, skb, daddr);
}
224
/* dst_ops::confirm_neigh hook: mark the next-hop neighbour as confirmed.
 * Skips devices that do no neighbour resolution (IFF_NOARP/loopback) and
 * multicast destinations, which have no unicast neighbour entry.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
239
/* Template for the per-netns IPv6 dst_ops; copied into
 * net->ipv6.ip6_dst_ops at namespace setup (outside this chunk).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
/* Intentional no-op: blackhole routes ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
270
/* Intentional no-op: blackhole routes ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
275
/* dst_ops for blackhole dsts: like ip6_dst_ops_template but with no-op
 * PMTU/redirect handlers and a metric-or-device MTU.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
287
/* Metrics template for the special route entries below; hop limit 0 means
 * "use the default" (metrics are stored at index RTAX_* - 1).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
291
/* Template for the per-netns fib6_null_entry: a permanent (refcount 1,
 * never freed) RTN_UNREACHABLE reject route used when lookups fail.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
300
/* Template for the per-netns ip6_null_entry: reject route that discards
 * traffic with -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for ip6_prohibit_entry (policy routing): rejects traffic with
 * -EACCES ("administratively prohibited").
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
326
/* Template for ip6_blk_hole_entry (policy routing): silently discards
 * traffic (generic dst_discard handlers, error -EINVAL).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
338
339 #endif
340
/* Zero the rt6_info-specific tail of a freshly allocated route.
 * dst_alloc() already initialized the embedded dst_entry, so the memset
 * deliberately starts at dst + 1 to clear only what follows it.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
348
/* Allocate an rt6_info dst with the per-netns ip6_dst_ops, taking an
 * initial reference (1) and marking it DST_OBSOLETE_FORCE_CHK so
 * ip6_dst_check() is always consulted.  Returns NULL on allocation
 * failure; bumps the per-netns fib_rt_alloc counter on success.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
364
/* dst_ops::destroy hook: release everything an rt6_info holds — metrics,
 * uncached-list membership, the inet6_dev reference, and the fib6_info
 * it was cloned from.  The xchg() atomically clears rt->from so the
 * fib6_info is released exactly once even under concurrent access.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}
383
/* dst_ops::ifdown hook: when @dev goes down, move the route's inet6_dev
 * reference over to the netns loopback device so the dst stays valid.
 * @how is unused here.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
400
401 static bool __rt6_check_expired(const struct rt6_info *rt)
402 {
403         if (rt->rt6i_flags & RTF_EXPIRES)
404                 return time_after(jiffies, rt->dst.expires);
405         else
406                 return false;
407 }
408
/* Full expiry check for a cached route.  Caller must be in an RCU read
 * section (rt->from is rcu_dereference()d).  A route with RTF_EXPIRES
 * expires on its own deadline; otherwise it is expired when it is no
 * longer DST_OBSOLETE_FORCE_CHK or its originating fib6_info expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
424
/* Multipath selection: pick one sibling route from res->f6i's ECMP group
 * using the flow hash, writing the choice back into @res.  The first
 * sibling whose upper bound covers the hash and whose nexthop scores
 * non-negative wins; if none does, the original match is kept.
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	/* single-path route, or oif already matched a specific sibling */
	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* hash falls within the first nexthop's weight range */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
462
463 /*
464  *      Route lookup. rcu_read_lock() should be held.
465  */
466
467 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
468                                const struct in6_addr *saddr, int oif, int flags)
469 {
470         const struct net_device *dev;
471
472         if (nh->fib_nh_flags & RTNH_F_DEAD)
473                 return false;
474
475         dev = nh->fib_nh_dev;
476         if (oif) {
477                 if (dev->ifindex == oif)
478                         return true;
479         } else {
480                 if (ipv6_chk_addr(net, saddr, dev,
481                                   flags & RT6_LOOKUP_F_IFACE))
482                         return true;
483         }
484
485         return false;
486 }
487
/* Walk the fib6_info list starting at res->f6i for an entry whose nexthop
 * matches @oif/@saddr, updating @res in place.  Falls back to the
 * fib6_null_entry when a strict interface match is required but not
 * found, or when the original nexthop is dead.  RCU read lock held.
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	/* no constraints: any live nexthop will do */
	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	/* strict interface binding and nothing matched: reject route */
	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
525
526 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a router reachability probe: the NS target
 * address and the device (held) to send it on.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
532
/* Workqueue handler: send a Neighbour Solicitation to the probe target's
 * solicited-node multicast address, then drop the device reference taken
 * in rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
544
/* Schedule a router reachability probe for @fib6_nh's gateway if its
 * neighbour entry is not NUD_VALID and the rtr_probe_interval rate limit
 * has elapsed.  The actual NS is sent from a workqueue
 * (rt6_probe_deferred) since we are under rcu_read_lock_bh() here.
 *
 * NOTE(review): the guard returns early when fib_nh_gw_family is SET,
 * i.e. probing proceeds only for gatewayless nexthops — looks inverted
 * relative to the comment below; confirm against upstream history.
 */
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		/* neigh->lock guards nud_state and neigh->updated */
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		/* ref dropped in rt6_probe_deferred() */
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
599 #else
/* Probing is only meaningful with router preference support. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
603 #endif
604
605 /*
606  * Default Router Selection (RFC 2461 6.3.6)
607  */
/* Classify the gateway neighbour's reachability for router selection
 * (RFC 2461 6.3.6).  With ROUTER_PREF, any state except NUD_FAILED
 * counts as success and a missing entry still succeeds (it will be
 * probed); without it, a missing entry requests round-robin.
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
635
/* Score a nexthop for route selection.  Returns a non-negative score
 * (higher is better: +2 for an oif match, plus the decoded router
 * preference shifted left by 2) or a negative rt6_nud_state failure.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	/* strict interface match required but not satisfied */
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	/* only consult neighbour state for gatewayed, non-NONEXTHOP routes */
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
657
/* Evaluate one nexthop against the best score seen so far (*mpri).
 * Returns true (and updates *mpri/*do_rr) when @nh becomes the new best.
 * Dead or (unignored) link-down nexthops are skipped outright; a
 * RT6_NUD_FAIL_DO_RR score still competes with the lowest valid score
 * but flags the caller to rotate the round-robin pointer.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
693
/* Scan the fib6_info sibling chain from @f6i_start (stopping at @nomatch)
 * for the best-scoring, unexpired nexthop at @metric, recording it in
 * @res.  When @cont is non-NULL and a different metric is reached, the
 * scan stops and that entry is returned via *cont for a later pass.
 * RCU read lock held.
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
723
/* Round-robin leaf search in two wrapping passes: from the current
 * rr pointer (@rr_head) to the end of the same-metric run, then from
 * @leaf back up to @rr_head.  Only if nothing matched at rr_head's
 * metric is the continuation (first entry of the next metric) scanned.
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
744
/* Default router selection for node @fn: pick the best route starting at
 * the round-robin pointer and, when requested (do_rr), advance that
 * pointer under tb6_lock so equal routes are used in turn.  Always
 * leaves @res populated, falling back to fib6_null_entry.  RCU read
 * lock held by the caller.
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
801
802 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
803 {
804         return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
805                res->nh->fib_nh_gw_family;
806 }
807
808 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information option received from router
 * @gwaddr on @dev: validate the option (length is in units of 8 octets,
 * 1..3 depending on prefix_len; prefix_len <= 128; preference must be
 * valid), then add, refresh, or (lifetime 0) delete the corresponding
 * RTF_ROUTEINFO route.  Returns 0 on success or -EINVAL on a malformed
 * option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 is a default-router advertisement */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		/* drop the lookup/add reference */
		fib6_info_release(rt);
	}
	return 0;
}
882 #endif
883
884 /*
885  *      Misc support functions
886  */
887
888 /* called with rcu_lock held */
889 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
890 {
891         struct net_device *dev = res->nh->fib_nh_dev;
892
893         if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
894                 /* for copies of local routes, dst->dev needs to be the
895                  * device if it is a master device, the master device if
896                  * device is enslaved, and the loopback as the default
897                  */
898                 if (netif_is_l3_slave(dev) &&
899                     !rt6_need_strict(&res->f6i->fib6_dst.addr))
900                         dev = l3mdev_master_dev_rcu(dev);
901                 else if (!netif_is_l3_master(dev))
902                         dev = dev_net(dev)->loopback_dev;
903                 /* last case is netif_is_l3_master(dev) is true in which
904                  * case we want dev returned to be dev
905                  */
906         }
907
908         return dev;
909 }
910
/* Map a fib6_type (RTN_*) to the dst.error value reported for that
 * route type: 0 for deliverable types, a negative errno for reject
 * route types (blackhole, unreachable, prohibit, throw, ...).
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
925
/* Translate an RTN_* route type into the dst error code (fib6_prop).
 * NOTE(review): no bounds check — assumes fib6_type <= RTN_MAX, which
 * callers presumably guarantee via rtnetlink validation; confirm.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
930
931 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
932 {
933         unsigned short flags = 0;
934
935         if (rt->dst_nocount)
936                 flags |= DST_NOCOUNT;
937         if (rt->dst_nopolicy)
938                 flags |= DST_NOPOLICY;
939         if (rt->dst_host)
940                 flags |= DST_HOST;
941
942         return flags;
943 }
944
945 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
946 {
947         rt->dst.error = ip6_rt_type_to_error(fib6_type);
948
949         switch (fib6_type) {
950         case RTN_BLACKHOLE:
951                 rt->dst.output = dst_discard_out;
952                 rt->dst.input = dst_discard;
953                 break;
954         case RTN_PROHIBIT:
955                 rt->dst.output = ip6_pkt_prohibit_out;
956                 rt->dst.input = ip6_pkt_prohibit;
957                 break;
958         case RTN_THROW:
959         case RTN_UNREACHABLE:
960         default:
961                 rt->dst.output = ip6_pkt_discard_out;
962                 rt->dst.input = ip6_pkt_discard;
963                 break;
964         }
965 }
966
/* Initialize the dst portion (error, input/output handlers, lwtunnel
 * state, lastuse) of a new rt6_info from the fib lookup result @res.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	/* reject routes get error handlers and nothing else */
	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	/* input handler: local delivery, multicast, or forwarding */
	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	/* lightweight tunnel encapsulation, if the nexthop has one */
	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
994
/* Caller must already hold reference to @from.
 * Bind a cached/clone route to its parent fib6_info: clear
 * RTF_EXPIRES (expiry is tracked on @from), publish rt->from for RCU
 * readers, and inherit the parent's metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
1002
/* Caller must already hold reference to f6i in result.
 * Populate a freshly allocated rt6_info from the fib lookup result:
 * dst handlers, destination/source prefixes, flags, gateway and the
 * back-pointer to the parent fib6_info.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	/* take a reference on the device's inet6_dev, if there is one */
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
1024
/* Walk back up the fib trie from @fn until a node carrying route
 * information (RTN_RTINFO) is found, descending into a parent's
 * source-address subtree when one exists and differs from @fn.
 * Returns NULL once the table root is reached without a match.
 * Caller must hold rcu_read_lock() (parent pointers are
 * rcu_dereference'd).
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1042
1043 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1044 {
1045         struct rt6_info *rt = *prt;
1046
1047         if (dst_hold_safe(&rt->dst))
1048                 return true;
1049         if (net) {
1050                 rt = net->ipv6.ip6_null_entry;
1051                 dst_hold(&rt->dst);
1052         } else {
1053                 rt = NULL;
1054         }
1055         *prt = rt;
1056         return false;
1057 }
1058
/* called with rcu_lock held.
 * Build an uncached rt6_info for the lookup result @res.  If the
 * parent f6i is going away or the allocation fails, fall back to the
 * netns null entry with a reference held, so the caller always gets a
 * usable dst.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	/* f6i may be mid-destruction; only proceed if we can hold it */
	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1085
/* Table lookup for @fl6 in @table: returns a cached exception route
 * when one matches, otherwise a freshly built uncached rt6_info, or
 * the netns null entry when nothing matches.  A reference is held on
 * the returned dst in every case.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* caller asked not to pin the lookup to the nexthop device */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		/* nothing usable at this node: back up the trie and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}
1138
/* Policy-rules aware route lookup: resolve @fl6 through the fib rules
 * framework, using ip6_pol_route_lookup() as the per-table lookup.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1145
1146 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1147                             const struct in6_addr *saddr, int oif,
1148                             const struct sk_buff *skb, int strict)
1149 {
1150         struct flowi6 fl6 = {
1151                 .flowi6_oif = oif,
1152                 .daddr = *daddr,
1153         };
1154         struct dst_entry *dst;
1155         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1156
1157         if (saddr) {
1158                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1159                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1160         }
1161
1162         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1163         if (dst->error == 0)
1164                 return (struct rt6_info *) dst;
1165
1166         dst_release(dst);
1167
1168         return NULL;
1169 }
1170 EXPORT_SYMBOL(rt6_lookup);
1171
1172 /* ip6_ins_rt is called with FREE table->tb6_lock.
1173  * It takes new route entry, the addition fails by any reason the
1174  * route is released.
1175  * Caller must hold dst before calling it.
1176  */
1177
1178 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1179                         struct netlink_ext_ack *extack)
1180 {
1181         int err;
1182         struct fib6_table *table;
1183
1184         table = rt->fib6_table;
1185         spin_lock_bh(&table->tb6_lock);
1186         err = fib6_add(&table->tb6_root, rt, info, extack);
1187         spin_unlock_bh(&table->tb6_lock);
1188
1189         return err;
1190 }
1191
1192 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1193 {
1194         struct nl_info info = { .nl_net = net, };
1195
1196         return __ip6_ins_rt(rt, &info, NULL);
1197 }
1198
/* Allocate an RTF_CACHE clone of lookup result @res, pinned to the
 * exact flow (@daddr, and @saddr under subtrees) as /128 entries.
 * Returns NULL if the parent f6i is going away or allocation fails.
 * Caller must hold rcu_read_lock() (ip6_rt_get_dev_rcu below).
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* narrow the destination to the exact /128 of this flow */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1241
/* Allocate a per-cpu (RTF_PCPU) copy of lookup result @res.  Returns
 * NULL if the parent f6i could not be held or the allocation fails;
 * on failure the hold taken on f6i is dropped again.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1264
/* It should be called with rcu_read_lock() acquired.
 * Fetch this CPU's cached copy of the route, taking a reference on
 * it; ip6_hold_safe() with net == NULL turns an about-to-die entry
 * into a NULL return.
 */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
1278
/* Build and publish this CPU's cached copy of @res.  Falls back to
 * the netns null entry (with a hold) on allocation failure.  The
 * per-cpu slot must be empty — the caller only gets here after
 * rt6_get_pcpu_route() returned nothing.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	/* If f6i started dying while we were allocating, the destroy
	 * path may have missed this entry, so drop the 'from'
	 * reference ourselves — presumably this pairs with the
	 * fib6_info destroy walking rt6i_pcpu; confirm against
	 * fib6_info_destroy_rcu().
	 */
	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
1304
/* exception (RTF_CACHE) hash table implementation.
 * rt6_exception_lock serializes all writers of the per-fib6_info
 * exception buckets seen below; lookups on the fast path use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1308
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* RCU readers may still traverse rt6_ex; defer the free */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1337
1338 /* Remove oldest rt6_ex in bucket and free the memory
1339  * Caller must hold rt6_exception_lock
1340  */
1341 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1342 {
1343         struct rt6_exception *rt6_ex, *oldest = NULL;
1344
1345         if (!bucket)
1346                 return;
1347
1348         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1349                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1350                         oldest = rt6_ex;
1351         }
1352         rt6_remove_exception(bucket, oldest);
1353 }
1354
/* Hash (dst, src) into one of the 2^FIB6_EXCEPTION_BUCKET_SIZE_SHIFT
 * exception buckets.  The source address participates only when
 * subtrees are enabled.
 *
 * NOTE(review): jhash with a once-initialized random seed; a peer
 * that can infer the seed could force bucket collisions.  Upstream
 * later hardened this path with siphash — worth confirming whether
 * that is wanted here.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
1370
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.
 * Note: *bucket is advanced to the hashed bucket even when no entry
 * matches, so the caller can insert into it afterwards.
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1403
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair.
 * RCU counterpart of __rt6_find_exception_spinlock(); keep the two
 * in sync.
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1438
/* Effective MTU for a fib lookup result: the route's PMTU when set,
 * otherwise the device MTU; clamped to IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom on the nexthop.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		/* NOTE(review): idev is dereferenced without a NULL
		 * check — presumably the nexthop device is always
		 * IPv6-enabled on these paths; confirm.
		 */
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
1460
/* Insert cached route @nrt as an exception hanging off lookup result
 * @res.  Returns 0 on success; -EINVAL when insertion is disallowed
 * (bucket already flushed, or @nrt's MTU is not below the route's);
 * -ENOMEM on allocation failure.  On success the fn sernum is bumped
 * so stale cached dsts are revalidated, and GC is kicked.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() already ran; do not repopulate */
	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same flow */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* bound the chain length by evicting the oldest entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1541
/* Remove every cached route (exception) hanging off @rt and block any
 * future insertions: exception_bucket_flushed stays set so that
 * rt6_insert_exception() refuses to repopulate the buckets.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1568
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock().
 * Returns the unexpired cached route for (daddr, saddr) or NULL; no
 * reference is taken — the caller must hold one before leaving the
 * RCU section (see ip6_hold_safe() at the call sites).
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6i_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}
1613
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL if @rt is not a cached route or has
 * no parent, -ENOENT when no matching exception exists.
 * NOTE(review): rt->from is read with rcu_dereference() — presumably
 * every caller holds rcu_read_lock(); confirm.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1657
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp.
 * The stamp only feeds oldest-entry eviction, so it is updated
 * locklessly under RCU.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1694
1695 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1696                                          struct rt6_info *rt, int mtu)
1697 {
1698         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1699          * lowest MTU in the path: always allow updating the route PMTU to
1700          * reflect PMTU decreases.
1701          *
1702          * If the new MTU is higher, and the route PMTU is equal to the local
1703          * MTU, this means the old MTU is the lowest in the path, so allow
1704          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1705          * handle this.
1706          */
1707
1708         if (dst_mtu(&rt->dst) >= mtu)
1709                 return true;
1710
1711         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1712                 return true;
1713
1714         return false;
1715 }
1716
/* Propagate a device MTU change to every exception route of @rt,
 * where rt6_mtu_change_route_allowed() permits it.
 * Caller must hold rt6_exception_lock (see the lockdep annotation).
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1745
/* a cached exception route that goes via a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Drop every gateway'd exception route of @rt whose gateway equals
 * @gateway.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1782
/* Examine one exception-table entry during garbage collection and
 * remove it when it is no longer worth keeping:
 *  - non-RTF_EXPIRES clones are aged out once unused for longer than
 *    gc_args->timeout;
 *  - RTF_EXPIRES entries (e.g. PMTU-generated ones) are purged as soon
 *    as they expire, independently of aging, per RFC 8201 section 4;
 *  - gateway clones whose neighbour entry no longer carries NTF_ROUTER
 *    are purged as well.
 * Entries that survive bump gc_args->more so the caller knows the table
 * is not yet empty.  Runs under rcu_read_lock_bh() + rt6_exception_lock
 * (see rt6_age_exceptions()), hence the _noref neighbour lookup.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* A missing neighbour counts as a non-router: purge too. */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1826
/* Walk @rt's whole exception table and let rt6_age_examine_exception()
 * decide, entry by entry, what to prune.  Takes rcu_read_lock_bh() for
 * the lockless neighbour lookup done while examining entries, and
 * rt6_exception_lock to protect the hash chains against concurrent
 * modification.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Lockless fast path: no exception table, nothing to age. */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: examine may unlink the entry */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1857
/* Core fib6 table lookup - must be called with rcu lock held.
 *
 * Find the fib6 node for @fl6's daddr/saddr, pick the best route with
 * rt6_select(), and backtrack up the tree whenever the selection comes
 * back as the null entry.  If nothing is found and the caller asked for
 * RT6_LOOKUP_F_REACHABLE, retry once from the original node with that
 * bit cleared so an unreachable route can still be returned.
 *
 * Always returns 0; the result is delivered through @res.
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* oif was already matched by l3mdev rules; don't match it again */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
1888
/* Policy routing lookup returning a referenced rt6_info for @fl6.
 * After the fib table lookup and multipath path selection, the result
 * is materialized in one of three ways:
 *   1. a matching exception-table (RTF_CACHE) entry, if one exists;
 *   2. for FLOWI_FLAG_KNOWN_NH flows resolved to a gatewayless nexthop,
 *      a fresh uncached RTF_CACHE clone kept off the fib6 tree (see the
 *      inline comment below);
 *   3. otherwise the per-cpu copy of the route, created on demand.
 * Every return path hands back a dst reference the caller must release.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* A host (no forwarding anywhere) prefers reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/*Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BH off: the percpu route cache is also touched from
		 * softirq context.
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1965
1966 static struct rt6_info *ip6_pol_route_input(struct net *net,
1967                                             struct fib6_table *table,
1968                                             struct flowi6 *fl6,
1969                                             const struct sk_buff *skb,
1970                                             int flags)
1971 {
1972         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1973 }
1974
1975 struct dst_entry *ip6_route_input_lookup(struct net *net,
1976                                          struct net_device *dev,
1977                                          struct flowi6 *fl6,
1978                                          const struct sk_buff *skb,
1979                                          int flags)
1980 {
1981         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1982                 flags |= RT6_LOOKUP_F_IFACE;
1983
1984         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1985 }
1986 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1987
/* Derive the L3 flow keys used for multipath hashing.  For ICMPv6 error
 * messages the keys are taken from the offending (inner) packet header
 * carried inside the error, so the error hashes onto the same path as
 * the flow that triggered it; any pre-dissected @flkeys are ignored in
 * that case.  Otherwise @flkeys is used when provided, else the outer
 * IPv6 header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* Only ICMPv6 errors carry the offending packet; informational
	 * messages keep hashing on the outer header.
	 */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2035
/* Compute the multipath hash for a flow according to the per-netns
 * hash policy: policy 0 hashes on L3 (addresses, flow label, protocol),
 * policy 1 on the L4 five-tuple.
 * If skb is set it will be used and fl6 can be NULL.
 * The result is shifted right once so the hash fits in 31 bits -
 * presumably callers reserve the top bit; confirm before relying on it.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* fall back to a full dissection when the caller
			 * did not pre-dissect the flow
			 */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
2092
/* Input-path route resolution entry point: build a flowi6 from the
 * packet's IPv6 header (plus collect-metadata tunnel key and skb mark),
 * optionally early-dissect the flow for fib rules, compute a multipath
 * hash for ICMPv6 packets (so errors follow the flow that caused them),
 * and attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* Propagate an RX tunnel key into the flow for metadata routes */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2122
2123 static struct rt6_info *ip6_pol_route_output(struct net *net,
2124                                              struct fib6_table *table,
2125                                              struct flowi6 *fl6,
2126                                              const struct sk_buff *skb,
2127                                              int flags)
2128 {
2129         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2130 }
2131
/* Output route lookup for locally generated traffic.  Multicast and
 * link-local destinations are first offered to an l3mdev link-scope
 * lookup.  Lookup flags are derived from the socket and flow: a bound
 * device, a strict destination, or an explicit oif with an unspecified
 * source force interface matching; a known source address enables
 * source matching, otherwise the socket's source-address preferences
 * are applied.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	/* Locally originated traffic enters the stack via loopback */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2161
/* Clone @dst_orig into a blackhole route: a dst bound to the loopback
 * device whose input and output handlers discard every packet, while
 * keeping the original's metrics, gateway, flags (minus RTF_PCPU) and
 * destination/source keys.  Releases @dst_orig and returns the new dst,
 * or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2194
2195 /*
2196  *      Destination cache support functions
2197  */
2198
2199 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2200 {
2201         u32 rt_cookie = 0;
2202
2203         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2204                 return false;
2205
2206         if (fib6_check_expired(f6i))
2207                 return false;
2208
2209         return true;
2210 }
2211
2212 static struct dst_entry *rt6_check(struct rt6_info *rt,
2213                                    struct fib6_info *from,
2214                                    u32 cookie)
2215 {
2216         u32 rt_cookie = 0;
2217
2218         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2219             rt_cookie != cookie)
2220                 return NULL;
2221
2222         if (rt6_check_expired(rt))
2223                 return NULL;
2224
2225         return &rt->dst;
2226 }
2227
2228 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2229                                             struct fib6_info *from,
2230                                             u32 cookie)
2231 {
2232         if (!__rt6_check_expired(rt) &&
2233             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2234             fib6_check(from, cookie))
2235                 return &rt->dst;
2236         else
2237                 return NULL;
2238 }
2239
/* dst_ops->check hook: revalidate a cached rt6_info.  Returns the dst
 * if still valid, or NULL so the caller performs a fresh lookup.
 * Percpu copies and uncached clones are validated through their 'from'
 * fib entry via rt6_dst_from_check(); plain entries via rt6_check().
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2267
/* dst_ops->negative_advice hook: invoked when a socket suspects its
 * cached route is bad.  Expired exception (RTF_CACHE) entries are
 * unlinked from their exception table; other entries are simply
 * released.  Returns NULL when the dst was dropped so the caller
 * forgets its reference.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2287
/* dst_ops->link_failure hook: report unreachability back to the sender
 * via ICMPv6 and invalidate the failing route - cached exceptions are
 * removed outright, while default routes poison their fib node's
 * sernum (presumably so cookie checks on derived dsts fail and force a
 * re-lookup).
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2313
/* (Re)arm the expiry timer on @rt0.  If the route did not yet carry
 * RTF_EXPIRES, first seed dst.expires from its originating fib entry,
 * then apply @timeout via dst_set_expires() and mark the route as
 * expiring.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2329
2330 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2331 {
2332         struct net *net = dev_net(rt->dst.dev);
2333
2334         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2335         rt->rt6i_flags |= RTF_MODIFIED;
2336         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2337 }
2338
2339 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2340 {
2341         return !(rt->rt6i_flags & RTF_CACHE) &&
2342                 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2343 }
2344
/* Core PMTU update for a dst.  The addresses used for neighbour
 * confirmation come from the packet header if given, else the socket,
 * else none.  @mtu is clamped to IPV6_MIN_MTU and increases are
 * ignored.  Routes that may not be cloned (see
 * rt6_cache_allowed_for_pmtu()) are updated in place; otherwise a new
 * RTF_CACHE clone carrying the MTU is inserted into the exception
 * table of the originating fib entry.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* A locked MTU metric must never be overridden by PMTU */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion failure means a duplicate exists; drop
			 * our clone immediately
			 */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2397
2398 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2399                                struct sk_buff *skb, u32 mtu)
2400 {
2401         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2402 }
2403
/* Update the path MTU toward the destination of the packet held in
 * @skb: do an output route lookup keyed on the packet's addresses and
 * flow label, then apply the new MTU to the resulting dst.  Note @mtu
 * arrives in network byte order (ntohl below).
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2424
/* Socket-scoped PMTU update: apply the new MTU using the socket's
 * bound device (or the l3mdev master of the receiving device) plus its
 * mark and uid.  If the socket's cached dst has become obsolete and
 * fails its cookie check afterwards, refresh the datagram route -
 * unless the socket is owned by user context or connected to a
 * v4-mapped address.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2446
/* Store @dst in @sk's dst cache via ip6_dst_store().  The destination
 * (and, with CONFIG_IPV6_SUBTREES, the source) address is recorded
 * only when the flow's address matches the socket's own; otherwise
 * NULL is passed so that field is left alone.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2463
/* Check whether nexthop @res->nh can be the router that sent a redirect
 * from gateway @gw for flow @fl6.  The nexthop must be alive, have a
 * gateway, and sit on the flow's output device.  When its configured
 * gateway differs from @gw, the exception table is consulted for a
 * cached route whose gateway does match; such a match is returned
 * through @ret.
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
2493
/* Handle redirects */

/* flowi6 extended with the redirecting gateway's address.  The fl6
 * member comes first so the struct can be passed through a flowi6
 * pointer and recovered with a cast in __ip6_route_redirect().
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2499
/* Table lookup used while processing an ICMPv6 redirect: walk the fib6
 * nodes matching the flow and accept the first non-expired, non-reject
 * route whose nexthop (or cached exception) agrees with the redirecting
 * gateway (see ip6_redirect_nh_match()).  Backtracks up the tree when
 * nothing matches; reject routes yield the null entry.  Without a
 * cached match, an rcu rt is created for the best candidate.  The
 * returned rt6_info holds a reference for the caller.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* l3mdev_update_flow overrides oif if the device is enslaved; in
	 * this case we must match on the real ingress device, so reset it
	 */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		fl6->flowi6_oif = skb->dev->ifindex;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		res.nh = &rt->fib6_nh;

		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
			goto out;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = &rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
};
2572
2573 static struct dst_entry *ip6_route_redirect(struct net *net,
2574                                             const struct flowi6 *fl6,
2575                                             const struct sk_buff *skb,
2576                                             const struct in6_addr *gateway)
2577 {
2578         int flags = RT6_LOOKUP_F_HAS_SADDR;
2579         struct ip6rd_flowi rdfl;
2580
2581         rdfl.fl6 = *fl6;
2582         rdfl.gateway = *gateway;
2583
2584         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2585                                 flags, __ip6_route_redirect);
2586 }
2587
/* Act on an ICMPv6 Redirect for the flow described by the IPv6 header at
 * skb->data, updating the matching route's nexthop/neighbour state via
 * rt6_do_redirect().
 * NOTE(review): the flow is built from the header at skb->data while the
 * redirecting router's address is taken from ipv6_hdr(skb)->saddr (the
 * network header) — these are different offsets; presumably skb->data
 * points at the embedded/offending header here. TODO confirm vs callers.
 */
2588 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2589                   kuid_t uid)
2590 {
2591         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2592         struct dst_entry *dst;
2593         struct flowi6 fl6 = {
2594                 .flowi6_iif = LOOPBACK_IFINDEX,
2595                 .flowi6_oif = oif,
2596                 .flowi6_mark = mark,
2597                 .daddr = iph->daddr,
2598                 .saddr = iph->saddr,
2599                 .flowlabel = ip6_flowinfo(iph),
2600                 .flowi6_uid = uid,
2601         };
2602
2603         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2604         rt6_do_redirect(dst, NULL, skb);
2605         dst_release(dst);
2606 }
2607 EXPORT_SYMBOL_GPL(ip6_redirect);
2608
/* Variant of ip6_redirect() used when the Redirect message carries no
 * copy of the offending packet header: the flow is reconstructed from the
 * ND redirect message itself (target from msg->dest, our address from the
 * outer header's daddr), and the redirecting router is iph->saddr.
 */
2609 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2610 {
2611         const struct ipv6hdr *iph = ipv6_hdr(skb);
2612         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2613         struct dst_entry *dst;
2614         struct flowi6 fl6 = {
2615                 .flowi6_iif = LOOPBACK_IFINDEX,
2616                 .flowi6_oif = oif,
2617                 .daddr = msg->dest,
                 /* saddr is OUR address: the redirect was sent to us. */
2618                 .saddr = iph->daddr,
2619                 .flowi6_uid = sock_net_uid(net, NULL),
2620         };
2621
2622         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2623         rt6_do_redirect(dst, NULL, skb);
2624         dst_release(dst);
2625 }
2626
2627 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2628 {
2629         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2630                      sk->sk_uid);
2631 }
2632 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2633
2634 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2635 {
2636         struct net_device *dev = dst->dev;
2637         unsigned int mtu = dst_mtu(dst);
2638         struct net *net = dev_net(dev);
2639
2640         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2641
2642         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2643                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2644
2645         /*
2646          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2647          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2648          * IPV6_MAXPLEN is also valid and means: "any MSS,
2649          * rely only on pmtu discovery"
2650          */
2651         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2652                 mtu = IPV6_MAXPLEN;
2653         return mtu;
2654 }
2655
2656 static unsigned int ip6_mtu(const struct dst_entry *dst)
2657 {
2658         struct inet6_dev *idev;
2659         unsigned int mtu;
2660
2661         mtu = dst_metric_raw(dst, RTAX_MTU);
2662         if (mtu)
2663                 goto out;
2664
2665         mtu = IPV6_MIN_MTU;
2666
2667         rcu_read_lock();
2668         idev = __in6_dev_get(dst->dev);
2669         if (idev)
2670                 mtu = idev->cnf.mtu6;
2671         rcu_read_unlock();
2672
2673 out:
2674         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2675
2676         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2677 }
2678
2679 /* MTU selection:
2680  * 1. mtu on route is locked - use it
2681  * 2. mtu from nexthop exception
2682  * 3. mtu from egress device
2683  *
2684  * based on ip6_dst_mtu_forward and exception logic of
2685  * rt6_find_cached_rt; called with rcu_read_lock
2686  */
/* @res:   fib lookup result (nexthop + fib6_info) to derive the MTU from
 * @daddr: destination used to find a cached PMTU exception
 * @saddr: source used for the exception lookup (subtree matching)
 * Returns the usable MTU, reduced by any lwtunnel encap headroom.
 */
2687 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2688                       const struct in6_addr *daddr,
2689                       const struct in6_addr *saddr)
2690 {
2691         const struct fib6_nh *nh = res->nh;
2692         struct fib6_info *f6i = res->f6i;
2693         struct inet6_dev *idev;
2694         struct rt6_info *rt;
2695         u32 mtu = 0;
2696
         /* A locked MTU metric overrides exceptions and device MTU,
          * but only when actually set (non-zero).
          */
2697         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2698                 mtu = f6i->fib6_pmtu;
2699                 if (mtu)
2700                         goto out;
2701         }
2702
2703         rt = rt6_find_cached_rt(res, daddr, saddr);
2704         if (unlikely(rt)) {
2705                 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2706         } else {
2707                 struct net_device *dev = nh->fib_nh_dev;
2708
2709                 mtu = IPV6_MIN_MTU;
2710                 idev = __in6_dev_get(dev);
2711                 if (idev && idev->cnf.mtu6 > mtu)
2712                         mtu = idev->cnf.mtu6;
2713         }
2714
2715         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2716 out:
2717         return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2718 }
2719
/* Allocate a throwaway host route used for sending ICMPv6/ND messages on
 * @dev for the flow @fl6.  Returns a dst (possibly xfrm-transformed) or an
 * ERR_PTR on failure; the caller owns the returned reference.
 */
2720 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2721                                   struct flowi6 *fl6)
2722 {
2723         struct dst_entry *dst;
2724         struct rt6_info *rt;
2725         struct inet6_dev *idev = in6_dev_get(dev);
2726         struct net *net = dev_net(dev);
2727
2728         if (unlikely(!idev))
2729                 return ERR_PTR(-ENODEV);
2730
2731         rt = ip6_dst_alloc(net, dev, 0);
2732         if (unlikely(!rt)) {
                 /* drop the idev reference taken above on this error path */
2733                 in6_dev_put(idev);
2734                 dst = ERR_PTR(-ENOMEM);
2735                 goto out;
2736         }
2737
2738         rt->dst.flags |= DST_HOST;
2739         rt->dst.input = ip6_input;
2740         rt->dst.output  = ip6_output;
2741         rt->rt6i_gateway  = fl6->daddr;
2742         rt->rt6i_dst.addr = fl6->daddr;
2743         rt->rt6i_dst.plen = 128;
         /* the idev reference is transferred to the route here */
2744         rt->rt6i_idev     = idev;
2745         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2746
2747         /* Add this dst into uncached_list so that rt6_disable_ip() can
2748          * do proper release of the net_device
2749          */
2750         rt6_uncached_list_add(rt);
2751         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2752
2753         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2754
2755 out:
2756         return dst;
2757 }
2758
/* dst_ops garbage-collection callback for IPv6 routes.  Returns non-zero
 * when the cache is still over ip6_rt_max_size after collection (telling
 * the dst layer that allocation pressure remains).  The gc "expire" age is
 * ramped up on every forced run and decayed by the elasticity sysctl.
 */
2759 static int ip6_dst_gc(struct dst_ops *ops)
2760 {
2761         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2762         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2763         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2764         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2765         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2766         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2767         int entries;
2768
2769         entries = dst_entries_get_fast(ops);
         /* rate-limit: skip gc if we ran recently and are under the cap */
2770         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2771             entries <= rt_max_size)
2772                 goto out;
2773
2774         net->ipv6.ip6_rt_gc_expire++;
2775         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2776         entries = dst_entries_get_slow(ops);
2777         if (entries < ops->gc_thresh)
2778                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2779 out:
         /* exponential decay of the aggressiveness counter */
2780         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2781         return entries > rt_max_size;
2782 }
2783
2784 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2785                                             struct fib6_config *cfg,
2786                                             const struct in6_addr *gw_addr,
2787                                             u32 tbid, int flags)
2788 {
2789         struct flowi6 fl6 = {
2790                 .flowi6_oif = cfg->fc_ifindex,
2791                 .daddr = *gw_addr,
2792                 .saddr = cfg->fc_prefsrc,
2793         };
2794         struct fib6_table *table;
2795         struct rt6_info *rt;
2796
2797         table = fib6_get_table(net, tbid);
2798         if (!table)
2799                 return NULL;
2800
2801         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2802                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2803
2804         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2805         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2806
2807         /* if table lookup failed, fall back to full lookup */
2808         if (rt == net->ipv6.ip6_null_entry) {
2809                 ip6_rt_put(rt);
2810                 rt = NULL;
2811         }
2812
2813         return rt;
2814 }
2815
/* Validate an RTNH_F_ONLINK nexthop: the gateway must not already be
 * reachable via a conflicting non-default route (local/anycast/reject, or
 * through a different device).  Returns 0 if acceptable, -EINVAL with an
 * extack message otherwise.
 */
2816 static int ip6_route_check_nh_onlink(struct net *net,
2817                                      struct fib6_config *cfg,
2818                                      const struct net_device *dev,
2819                                      struct netlink_ext_ack *extack)
2820 {
2821         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2822         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2823         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2824         struct fib6_info *from;
2825         struct rt6_info *grt;
2826         int err;
2827
2828         err = 0;
2829         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2830         if (grt) {
2831                 rcu_read_lock();
2832                 from = rcu_dereference(grt->from);
2833                 if (!grt->dst.error &&
2834                     /* ignore match if it is the default route */
2835                     from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2836                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2837                         NL_SET_ERR_MSG(extack,
2838                                        "Nexthop has invalid gateway or device mismatch");
2839                         err = -EINVAL;
2840                 }
2841                 rcu_read_unlock();
2842
2843                 ip6_rt_put(grt);
2844         }
2845
2846         return err;
2847 }
2848
/* Resolve the device/idev for a gateway nexthop by looking up the route to
 * the gateway.  On success (gateway is directly reachable, i.e. not itself
 * behind another gateway) returns 0 and, if *_dev was NULL, fills *_dev and
 * *idev with fresh references (dev_hold/in6_dev_hold).  Returns
 * -EHOSTUNREACH otherwise.
 */
2849 static int ip6_route_check_nh(struct net *net,
2850                               struct fib6_config *cfg,
2851                               struct net_device **_dev,
2852                               struct inet6_dev **idev)
2853 {
2854         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2855         struct net_device *dev = _dev ? *_dev : NULL;
2856         struct rt6_info *grt = NULL;
2857         int err = -EHOSTUNREACH;
2858
         /* Prefer a lookup scoped to the table the route is added to. */
2859         if (cfg->fc_table) {
2860                 int flags = RT6_LOOKUP_F_IFACE;
2861
2862                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2863                                           cfg->fc_table, flags);
2864                 if (grt) {
                         /* gateway-behind-gateway or wrong device: discard */
2865                         if (grt->rt6i_flags & RTF_GATEWAY ||
2866                             (dev && dev != grt->dst.dev)) {
2867                                 ip6_rt_put(grt);
2868                                 grt = NULL;
2869                         }
2870                 }
2871         }
2872
2873         if (!grt)
2874                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2875
2876         if (!grt)
2877                 goto out;
2878
2879         if (dev) {
2880                 if (dev != grt->dst.dev) {
2881                         ip6_rt_put(grt);
2882                         goto out;
2883                 }
2884         } else {
                 /* adopt the resolved device/idev, taking references for
                  * the caller
                  */
2885                 *_dev = dev = grt->dst.dev;
2886                 *idev = grt->rt6i_idev;
2887                 dev_hold(dev);
2888                 in6_dev_hold(grt->rt6i_idev);
2889         }
2890
2891         if (!(grt->rt6i_flags & RTF_GATEWAY))
2892                 err = 0;
2893
2894         ip6_rt_put(grt);
2895
2896 out:
2897         return err;
2898 }
2899
/* Validate the gateway of a new route and resolve its egress device.
 * Rejects local-address gateways and non-link-local/non-mapped unicast
 * gateways, dispatches onlink vs regular nexthop checks, and enforces a
 * non-loopback egress device.  May update *_dev / *idev (with references)
 * via ip6_route_check_nh().  Returns 0 or a negative errno with extack set.
 */
2900 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2901                            struct net_device **_dev, struct inet6_dev **idev,
2902                            struct netlink_ext_ack *extack)
2903 {
2904         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2905         int gwa_type = ipv6_addr_type(gw_addr);
         /* link-local gateways must match on device; others may not */
2906         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2907         const struct net_device *dev = *_dev;
2908         bool need_addr_check = !dev;
2909         int err = -EINVAL;
2910
2911         /* if gw_addr is local we will fail to detect this in case
2912          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2913          * will return already-added prefix route via interface that
2914          * prefix route was assigned to, which might be non-loopback.
2915          */
2916         if (dev &&
2917             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2918                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2919                 goto out;
2920         }
2921
2922         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2923                 /* IPv6 strictly inhibits using not link-local
2924                  * addresses as nexthop address.
2925                  * Otherwise, router will not able to send redirects.
2926                  * It is very good, but in some (rare!) circumstances
2927                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2928                  * some exceptions. --ANK
2929                  * We allow IPv4-mapped nexthops to support RFC4798-type
2930                  * addressing
2931                  */
2932                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2933                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2934                         goto out;
2935                 }
2936
2937                 if (cfg->fc_flags & RTNH_F_ONLINK)
2938                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2939                 else
2940                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2941
2942                 if (err)
2943                         goto out;
2944         }
2945
2946         /* reload in case device was changed */
2947         dev = *_dev;
2948
2949         err = -EINVAL;
2950         if (!dev) {
2951                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2952                 goto out;
2953         } else if (dev->flags & IFF_LOOPBACK) {
2954                 NL_SET_ERR_MSG(extack,
2955                                "Egress device can not be loopback device for this route");
2956                 goto out;
2957         }
2958
2959         /* if we did not check gw_addr above, do so now that the
2960          * egress device has been resolved.
2961          */
2962         if (need_addr_check &&
2963             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2964                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2965                 goto out;
2966         }
2967
2968         err = 0;
2969 out:
2970         return err;
2971 }
2972
2973 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2974 {
2975         if ((flags & RTF_REJECT) ||
2976             (dev && (dev->flags & IFF_LOOPBACK) &&
2977              !(addr_type & IPV6_ADDR_LOOPBACK) &&
2978              !(flags & RTF_LOCAL)))
2979                 return true;
2980
2981         return false;
2982 }
2983
/* Initialize a fib6_nh from a route configuration: resolve and validate
 * the nexthop device (promoting impossible loopback routes to rejects),
 * validate any gateway, and set up lwtunnel encap state.  On success the
 * device reference is transferred into fib6_nh->fib_nh_dev; on failure all
 * acquired references and lwt state are released.  Returns 0 or -errno.
 */
2984 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2985                  struct fib6_config *cfg, gfp_t gfp_flags,
2986                  struct netlink_ext_ack *extack)
2987 {
2988         struct net_device *dev = NULL;
2989         struct inet6_dev *idev = NULL;
2990         int addr_type;
2991         int err;
2992
2993         fib6_nh->fib_nh_family = AF_INET6;
2994
2995         err = -ENODEV;
2996         if (cfg->fc_ifindex) {
2997                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2998                 if (!dev)
2999                         goto out;
3000                 idev = in6_dev_get(dev);
3001                 if (!idev)
3002                         goto out;
3003         }
3004
3005         if (cfg->fc_flags & RTNH_F_ONLINK) {
3006                 if (!dev) {
3007                         NL_SET_ERR_MSG(extack,
3008                                        "Nexthop device required for onlink");
3009                         goto out;
3010                 }
3011
3012                 if (!(dev->flags & IFF_UP)) {
3013                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3014                         err = -ENETDOWN;
3015                         goto out;
3016                 }
3017
3018                 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3019         }
3020
3021         fib6_nh->fib_nh_weight = 1;
3022
3023         /* We cannot add true routes via loopback here,
3024          * they would result in kernel looping; promote them to reject routes
3025          */
3026         addr_type = ipv6_addr_type(&cfg->fc_dst);
3027         if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3028                 /* hold loopback dev/idev if we haven't done so. */
3029                 if (dev != net->loopback_dev) {
3030                         if (dev) {
3031                                 dev_put(dev);
3032                                 in6_dev_put(idev);
3033                         }
3034                         dev = net->loopback_dev;
3035                         dev_hold(dev);
3036                         idev = in6_dev_get(dev);
3037                         if (!idev) {
3038                                 err = -ENODEV;
3039                                 goto out;
3040                         }
3041                 }
                 /* reject routes skip gateway/device-state validation */
3042                 goto set_dev;
3043         }
3044
3045         if (cfg->fc_flags & RTF_GATEWAY) {
3046                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3047                 if (err)
3048                         goto out;
3049
3050                 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3051                 fib6_nh->fib_nh_gw_family = AF_INET6;
3052         }
3053
3054         err = -ENODEV;
3055         if (!dev)
3056                 goto out;
3057
3058         if (idev->cnf.disable_ipv6) {
3059                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3060                 err = -EACCES;
3061                 goto out;
3062         }
3063
3064         if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3065                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3066                 err = -ENETDOWN;
3067                 goto out;
3068         }
3069
3070         if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3071             !netif_carrier_ok(dev))
3072                 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3073
3074         err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3075                                  cfg->fc_encap_type, cfg, gfp_flags, extack);
3076         if (err)
3077                 goto out;
3078 set_dev:
         /* transfer the dev reference into the nexthop */
3079         fib6_nh->fib_nh_dev = dev;
3080         fib6_nh->fib_nh_oif = dev->ifindex;
3081         err = 0;
3082 out:
3083         if (idev)
3084                 in6_dev_put(idev);
3085
3086         if (err) {
                 /* unwind lwtunnel state and the device reference on error */
3087                 lwtstate_put(fib6_nh->fib_nh_lws);
3088                 fib6_nh->fib_nh_lws = NULL;
3089                 if (dev)
3090                         dev_put(dev);
3091         }
3092
3093         return err;
3094 }
3095
/* Release the shared nexthop state (device ref, lwtunnel state, per-cpu
 * caches) initialized by fib6_nh_init().
 */
3096 void fib6_nh_release(struct fib6_nh *fib6_nh)
3097 {
3098         fib_nh_common_release(&fib6_nh->nh_common);
3099 }
3100
/* Build a fib6_info from a netlink/ioctl route configuration without
 * inserting it into any table.  Validates the config, selects/creates the
 * table, allocates the fib6_info, and initializes metrics, expiry, prefix,
 * nexthop and preferred source.  Returns the new fib6_info (one reference,
 * owned by the caller) or an ERR_PTR; extack is set on validation errors.
 */
3101 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3102                                               gfp_t gfp_flags,
3103                                               struct netlink_ext_ack *extack)
3104 {
3105         struct net *net = cfg->fc_nlinfo.nl_net;
3106         struct fib6_info *rt = NULL;
3107         struct fib6_table *table;
3108         int err = -EINVAL;
3109         int addr_type;
3110
3111         /* RTF_PCPU is an internal flag; can not be set by userspace */
3112         if (cfg->fc_flags & RTF_PCPU) {
3113                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3114                 goto out;
3115         }
3116
3117         /* RTF_CACHE is an internal flag; can not be set by userspace */
3118         if (cfg->fc_flags & RTF_CACHE) {
3119                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3120                 goto out;
3121         }
3122
3123         if (cfg->fc_type > RTN_MAX) {
3124                 NL_SET_ERR_MSG(extack, "Invalid route type");
3125                 goto out;
3126         }
3127
3128         if (cfg->fc_dst_len > 128) {
3129                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3130                 goto out;
3131         }
3132         if (cfg->fc_src_len > 128) {
3133                 NL_SET_ERR_MSG(extack, "Invalid source address length");
3134                 goto out;
3135         }
3136 #ifndef CONFIG_IPV6_SUBTREES
3137         if (cfg->fc_src_len) {
3138                 NL_SET_ERR_MSG(extack,
3139                                "Specifying source address requires IPV6_SUBTREES to be enabled");
3140                 goto out;
3141         }
3142 #endif
3143
3144         err = -ENOBUFS;
         /* Without NLM_F_CREATE the table should already exist; warn but
          * still create it for backward compatibility.
          */
3145         if (cfg->fc_nlinfo.nlh &&
3146             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3147                 table = fib6_get_table(net, cfg->fc_table);
3148                 if (!table) {
3149                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3150                         table = fib6_new_table(net, cfg->fc_table);
3151                 }
3152         } else {
3153                 table = fib6_new_table(net, cfg->fc_table);
3154         }
3155
3156         if (!table)
3157                 goto out;
3158
3159         err = -ENOMEM;
3160         rt = fib6_info_alloc(gfp_flags);
3161         if (!rt)
3162                 goto out;
3163
3164         rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3165                                                extack);
3166         if (IS_ERR(rt->fib6_metrics)) {
3167                 err = PTR_ERR(rt->fib6_metrics);
3168                 /* Do not leave garbage there. */
3169                 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3170                 goto out;
3171         }
3172
3173         if (cfg->fc_flags & RTF_ADDRCONF)
3174                 rt->dst_nocount = true;
3175
3176         if (cfg->fc_flags & RTF_EXPIRES)
3177                 fib6_set_expires(rt, jiffies +
3178                                 clock_t_to_jiffies(cfg->fc_expires));
3179         else
3180                 fib6_clean_expires(rt);
3181
3182         if (cfg->fc_protocol == RTPROT_UNSPEC)
3183                 cfg->fc_protocol = RTPROT_BOOT;
3184         rt->fib6_protocol = cfg->fc_protocol;
3185
3186         rt->fib6_table = table;
3187         rt->fib6_metric = cfg->fc_metric;
3188         rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
         /* RTF_GATEWAY is represented by the nexthop's gw_family instead */
3189         rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3190
3191         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3192         rt->fib6_dst.plen = cfg->fc_dst_len;
3193         if (rt->fib6_dst.plen == 128)
3194                 rt->dst_host = true;
3195
3196 #ifdef CONFIG_IPV6_SUBTREES
3197         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3198         rt->fib6_src.plen = cfg->fc_src_len;
3199 #endif
3200         err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3201         if (err)
3202                 goto out;
3203
3204         /* We cannot add true routes via loopback here,
3205          * they would result in kernel looping; promote them to reject routes
3206          */
3207         addr_type = ipv6_addr_type(&cfg->fc_dst);
3208         if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3209                 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3210
3211         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3212                 struct net_device *dev = fib6_info_nh_dev(rt);
3213
3214                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3215                         NL_SET_ERR_MSG(extack, "Invalid source address");
3216                         err = -EINVAL;
3217                         goto out;
3218                 }
3219                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3220                 rt->fib6_prefsrc.plen = 128;
3221         } else
3222                 rt->fib6_prefsrc.plen = 0;
3223
3224         return rt;
3225 out:
         /* fib6_info_release() tolerates rt == NULL */
3226         fib6_info_release(rt);
3227         return ERR_PTR(err);
3228 }
3229
/* Create a fib6_info from @cfg and insert it into its table.  The local
 * reference from ip6_route_info_create() is dropped afterwards; the table
 * holds its own reference on success.  Returns 0 or a negative errno.
 */
3230 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3231                   struct netlink_ext_ack *extack)
3232 {
3233         struct fib6_info *rt;
3234         int err;
3235
3236         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3237         if (IS_ERR(rt))
3238                 return PTR_ERR(rt);
3239
3240         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3241         fib6_info_release(rt);
3242
3243         return err;
3244 }
3245
/* Remove @rt from its table under tb6_lock.  Consumes the caller's
 * reference on @rt regardless of the outcome.  Deleting the null entry
 * is refused with -ENOENT.
 */
3246 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3247 {
3248         struct net *net = info->nl_net;
3249         struct fib6_table *table;
3250         int err;
3251
3252         if (rt == net->ipv6.fib6_null_entry) {
3253                 err = -ENOENT;
3254                 goto out;
3255         }
3256
3257         table = rt->fib6_table;
3258         spin_lock_bh(&table->tb6_lock);
3259         err = fib6_del(rt, info);
3260         spin_unlock_bh(&table->tb6_lock);
3261
3262 out:
         /* drop the reference the caller passed in */
3263         fib6_info_release(rt);
3264         return err;
3265 }
3266
/* Public wrapper around __ip6_del_rt() with an empty netlink info
 * (no notification portid/seq).  Consumes the caller's reference on @rt.
 */
3267 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3268 {
3269         struct nl_info info = { .nl_net = net };
3270
3271         return __ip6_del_rt(rt, &info);
3272 }
3273
/* Delete a multipath route: @rt plus all of its siblings when
 * fc_delete_all_nh is set.  Tries to emit a single RTM_DELROUTE
 * notification covering every hop (suppressing the per-hop notifications)
 * before removing them under tb6_lock.  Consumes the caller's reference
 * on @rt.  Returns 0 or a negative errno.
 */
3274 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3275 {
3276         struct nl_info *info = &cfg->fc_nlinfo;
3277         struct net *net = info->nl_net;
3278         struct sk_buff *skb = NULL;
3279         struct fib6_table *table;
3280         int err = -ENOENT;
3281
3282         if (rt == net->ipv6.fib6_null_entry)
3283                 goto out_put;
3284         table = rt->fib6_table;
3285         spin_lock_bh(&table->tb6_lock);
3286
3287         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3288                 struct fib6_info *sibling, *next_sibling;
3289
3290                 /* prefer to send a single notification with all hops */
3291                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3292                 if (skb) {
3293                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3294
3295                         if (rt6_fill_node(net, skb, rt, NULL,
3296                                           NULL, NULL, 0, RTM_DELROUTE,
3297                                           info->portid, seq, 0) < 0) {
3298                                 kfree_skb(skb);
3299                                 skb = NULL;
3300                         } else
3301                                 info->skip_notify = 1;
3302                 }
3303
3304                 list_for_each_entry_safe(sibling, next_sibling,
3305                                          &rt->fib6_siblings,
3306                                          fib6_siblings) {
3307                         err = fib6_del(sibling, info);
3308                         if (err)
3309                                 goto out_unlock;
3310                 }
3311         }
3312
3313         err = fib6_del(rt, info);
3314 out_unlock:
3315         spin_unlock_bh(&table->tb6_lock);
3316 out_put:
3317         fib6_info_release(rt);
3318
         /* send the combined notification outside the table lock */
3319         if (skb) {
3320                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3321                             info->nlh, gfp_any());
3322         }
3323         return err;
3324 }
3325
3326 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3327 {
3328         int rc = -ESRCH;
3329
3330         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3331                 goto out;
3332
3333         if (cfg->fc_flags & RTF_GATEWAY &&
3334             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3335                 goto out;
3336
3337         rc = rt6_remove_exception_rt(rt);
3338 out:
3339         return rc;
3340 }
3341
/* Delete the route described by @cfg from its table.  With RTF_CACHE set,
 * only the matching cached exception entry is removed; otherwise the first
 * fib6_info matching device/gateway/metric/protocol is deleted (one hop if
 * a gateway was given, all siblings otherwise).  Returns 0 or -ESRCH when
 * nothing matched.
 */
3342 static int ip6_route_del(struct fib6_config *cfg,
3343                          struct netlink_ext_ack *extack)
3344 {
3345         struct rt6_info *rt_cache;
3346         struct fib6_table *table;
3347         struct fib6_info *rt;
3348         struct fib6_node *fn;
3349         int err = -ESRCH;
3350
3351         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3352         if (!table) {
3353                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3354                 return err;
3355         }
3356
3357         rcu_read_lock();
3358
         /* for RTF_CACHE also match intermediate (non-leaf) nodes, since
          * exceptions hang off the covering route
          */
3359         fn = fib6_locate(&table->tb6_root,
3360                          &cfg->fc_dst, cfg->fc_dst_len,
3361                          &cfg->fc_src, cfg->fc_src_len,
3362                          !(cfg->fc_flags & RTF_CACHE));
3363
3364         if (fn) {
3365                 for_each_fib6_node_rt_rcu(fn) {
3366                         struct fib6_nh *nh;
3367
3368                         if (cfg->fc_flags & RTF_CACHE) {
3369                                 struct fib6_result res = {
3370                                         .f6i = rt,
3371                                 };
3372                                 int rc;
3373
3374                                 rt_cache = rt6_find_cached_rt(&res,
3375                                                               &cfg->fc_dst,
3376                                                               &cfg->fc_src);
3377                                 if (rt_cache) {
3378                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3379                                         if (rc != -ESRCH) {
3380                                                 rcu_read_unlock();
3381                                                 return rc;
3382                                         }
3383                                 }
3384                                 continue;
3385                         }
3386
3387                         nh = &rt->fib6_nh;
3388                         if (cfg->fc_ifindex &&
3389                             (!nh->fib_nh_dev ||
3390                              nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3391                                 continue;
3392                         if (cfg->fc_flags & RTF_GATEWAY &&
3393                             !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3394                                 continue;
3395                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3396                                 continue;
3397                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3398                                 continue;
                         /* take a reference before leaving the RCU section;
                          * the __ip6_del_rt* helpers consume it
                          */
3399                         if (!fib6_info_hold_safe(rt))
3400                                 continue;
3401                         rcu_read_unlock();
3402
3403                         /* if gateway was specified only delete the one hop */
3404                         if (cfg->fc_flags & RTF_GATEWAY)
3405                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3406
3407                         return __ip6_del_rt_siblings(rt, cfg);
3408                 }
3409         }
3410         rcu_read_unlock();
3411
3412         return err;
3413 }
3414
/* Process an ICMPv6 Redirect (RFC 4861, section 8): validate the message,
 * update the neighbour cache entry for the new first hop and install a
 * cached host-route exception for the redirected destination.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Length of the ND options that follow the fixed redirect header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link; otherwise
	 * the target must be a link-local unicast (router) address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Forwarding interfaces must not accept redirects; also honour the
	 * accept_redirects sysctl.
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* Create the neighbour entry for the new next hop if necessary. */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	res.nh = &res.f6i->fib6_nh;
	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	/* Clone a host route for the destination via the new gateway. */
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
3538
3539 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA Route Information (RTF_ROUTEINFO) route for @prefix
 * learned via @dev with gateway @gwaddr.  Returns the route with a
 * reference held (fib6_info_hold_safe), or NULL if none matches.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	/* Route-info routes live in RT6_TABLE_INFO unless the device is
	 * enslaved to an l3mdev (VRF), which supplies its own table id.
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* NOTE: the iterator macro assigns successive entries to 'rt'. */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3576
3577 static struct fib6_info *rt6_add_route_info(struct net *net,
3578                                            const struct in6_addr *prefix, int prefixlen,
3579                                            const struct in6_addr *gwaddr,
3580                                            struct net_device *dev,
3581                                            unsigned int pref)
3582 {
3583         struct fib6_config cfg = {
3584                 .fc_metric      = IP6_RT_PRIO_USER,
3585                 .fc_ifindex     = dev->ifindex,
3586                 .fc_dst_len     = prefixlen,
3587                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3588                                   RTF_UP | RTF_PREF(pref),
3589                 .fc_protocol = RTPROT_RA,
3590                 .fc_type = RTN_UNICAST,
3591                 .fc_nlinfo.portid = 0,
3592                 .fc_nlinfo.nlh = NULL,
3593                 .fc_nlinfo.nl_net = net,
3594         };
3595
3596         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3597         cfg.fc_dst = *prefix;
3598         cfg.fc_gateway = *gwaddr;
3599
3600         /* We should treat it as a default route if prefix length is 0. */
3601         if (!prefixlen)
3602                 cfg.fc_flags |= RTF_DEFAULT;
3603
3604         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3605
3606         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3607 }
3608 #endif
3609
/* Find the RA-learned (RTF_ADDRCONF | RTF_DEFAULT) default route via
 * gateway @addr on @dev.  Returns the route with a reference held, or
 * NULL if no matching entry exists or the reference could not be taken.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	/* Default routers go to RT6_TABLE_DFLT unless an l3mdev (VRF)
	 * overrides the table id.
	 */
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* rt is NULL here when the walk completed without a match. */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3636
/* Install a default route learned from a Router Advertisement.  On
 * success the owning table is flagged RT6_TABLE_HAS_DFLT_ROUTER so
 * rt6_purge_dflt_routers() can find it later.  Returns the installed
 * route with a reference held (via rt6_get_dflt_router), or NULL.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3667
/* Delete all RA-learned default-router routes from @table, except those
 * on interfaces with accept_ra == 2.  The walk restarts from the top
 * after every deletion because ip6_del_rt() must run outside the RCU
 * read-side critical section.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			/* drop RCU before deleting; rt is pinned by the ref */
			rcu_read_unlock();
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3691
3692 void rt6_purge_dflt_routers(struct net *net)
3693 {
3694         struct fib6_table *table;
3695         struct hlist_head *head;
3696         unsigned int h;
3697
3698         rcu_read_lock();
3699
3700         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3701                 head = &net->ipv6.fib_table_hash[h];
3702                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3703                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3704                                 __rt6_purge_dflt_routers(net, table);
3705                 }
3706         }
3707
3708         rcu_read_unlock();
3709 }
3710
3711 static void rtmsg_to_fib6_config(struct net *net,
3712                                  struct in6_rtmsg *rtmsg,
3713                                  struct fib6_config *cfg)
3714 {
3715         *cfg = (struct fib6_config){
3716                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3717                          : RT6_TABLE_MAIN,
3718                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3719                 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3720                 .fc_expires = rtmsg->rtmsg_info,
3721                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3722                 .fc_src_len = rtmsg->rtmsg_src_len,
3723                 .fc_flags = rtmsg->rtmsg_flags,
3724                 .fc_type = rtmsg->rtmsg_type,
3725
3726                 .fc_nlinfo.nl_net = net,
3727
3728                 .fc_dst = rtmsg->rtmsg_dst,
3729                 .fc_src = rtmsg->rtmsg_src,
3730                 .fc_gateway = rtmsg->rtmsg_gateway,
3731         };
3732 }
3733
3734 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3735 {
3736         struct fib6_config cfg;
3737         struct in6_rtmsg rtmsg;
3738         int err;
3739
3740         switch (cmd) {
3741         case SIOCADDRT:         /* Add a route */
3742         case SIOCDELRT:         /* Delete a route */
3743                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3744                         return -EPERM;
3745                 err = copy_from_user(&rtmsg, arg,
3746                                      sizeof(struct in6_rtmsg));
3747                 if (err)
3748                         return -EFAULT;
3749
3750                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3751
3752                 rtnl_lock();
3753                 switch (cmd) {
3754                 case SIOCADDRT:
3755                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3756                         break;
3757                 case SIOCDELRT:
3758                         err = ip6_route_del(&cfg, NULL);
3759                         break;
3760                 default:
3761                         err = -EINVAL;
3762                 }
3763                 rtnl_unlock();
3764
3765                 return err;
3766         }
3767
3768         return -EINVAL;
3769 }
3770
3771 /*
3772  *      Drop the packet on the floor
3773  */
3774
/* Common handler for "no route" dst entries: bump the relevant SNMP
 * counter, send an ICMPv6 destination-unreachable with @code and free
 * the skb.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	int type;

	/* For l3mdev (VRF) input the dst points at the loopback device;
	 * account the error against the real ingress interface instead.
	 */
	if (netif_is_l3_master(skb->dev) &&
	    dst->dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination: address error, not no-route */
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3809
/* dst input handler for blackhole/no-route entries (inbound path). */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3814
/* dst output handler for blackhole/no-route entries (outbound path). */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* point skb->dev at the egress device before reporting the drop */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3820
/* dst input handler for prohibit entries: ICMPv6 admin-prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3825
/* dst output handler for prohibit entries: ICMPv6 admin-prohibited. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* point skb->dev at the egress device before reporting the drop */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3831
3832 /*
3833  *      Allocate a dst for local (unicast / anycast) address.
3834  */
3835
/* Allocate a fib6_info for a local unicast or anycast address (/128
 * host route in RT6_TABLE_LOCAL, or the device's l3mdev table).  The
 * entry is created via ip6_route_info_create(); insertion into the
 * table is not done here.
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	struct fib6_config cfg = {
		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
		.fc_ifindex = idev->dev->ifindex,
		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
		.fc_dst = *addr,
		.fc_dst_len = 128,
		.fc_protocol = RTPROT_KERNEL,
		.fc_nlinfo.nl_net = net,
		/* local address routes survive the device going down */
		.fc_ignore_dev_down = true,
	};

	if (anycast) {
		cfg.fc_type = RTN_ANYCAST;
		cfg.fc_flags |= RTF_ANYCAST;
	} else {
		cfg.fc_type = RTN_LOCAL;
		cfg.fc_flags |= RTF_LOCAL;
	}

	return ip6_route_info_create(&cfg, gfp_flags, NULL);
}
3862
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict match to this device; NULL = any */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3869
3870 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3871 {
3872         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3873         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3874         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3875
3876         if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3877             rt != net->ipv6.fib6_null_entry &&
3878             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3879                 spin_lock_bh(&rt6_exception_lock);
3880                 /* remove prefsrc entry */
3881                 rt->fib6_prefsrc.plen = 0;
3882                 spin_unlock_bh(&rt6_exception_lock);
3883         }
3884         return 0;
3885 }
3886
3887 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3888 {
3889         struct net *net = dev_net(ifp->idev->dev);
3890         struct arg_dev_net_ip adni = {
3891                 .dev = ifp->idev->dev,
3892                 .net = net,
3893                 .addr = &ifp->addr,
3894         };
3895         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3896 }
3897
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)

/* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: returning -1 marks an RA-learned default
 * route via @arg's gateway for deletion; cached exceptions using the
 * gateway are cleaned in either case.
 */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    rt->fib6_nh.fib_nh_gw_family &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3919
/* Drop routes and cached exceptions still using @gateway after it has
 * stopped acting as a router.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3924
/* Argument for the fib6_clean_all() device-event callbacks below. */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event applies to */
	union {
		unsigned char nh_flags;	/* RTNH_F_* bits (rt6_sync_up) */
		unsigned long event;	/* NETDEV_* (rt6_sync_down_dev) */
	};
};
3932
/* Return the first route of @rt's ECMP group: the first entry in the
 * same fib6_node's leaf chain with @rt's metric that qualifies for
 * ECMP.  The lockdep annotations require the table write lock to be
 * held.  Returns NULL if no qualifying entry is found.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3952
3953 static bool rt6_is_dead(const struct fib6_info *rt)
3954 {
3955         if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3956             (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3957              ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3958                 return true;
3959
3960         return false;
3961 }
3962
3963 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3964 {
3965         struct fib6_info *iter;
3966         int total = 0;
3967
3968         if (!rt6_is_dead(rt))
3969                 total += rt->fib6_nh.fib_nh_weight;
3970
3971         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3972                 if (!rt6_is_dead(iter))
3973                         total += iter->fib6_nh.fib_nh_weight;
3974         }
3975
3976         return total;
3977 }
3978
/* Assign the hash upper bound for one nexthop.  *weight accumulates the
 * weights of live nexthops seen so far; the bound is that cumulative
 * weight scaled into the 31-bit hash space, minus one so the bounds are
 * inclusive.  Dead nexthops get an upper bound of -1.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.fib_nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
}
3990
3991 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3992 {
3993         struct fib6_info *iter;
3994         int weight = 0;
3995
3996         rt6_upper_bound_set(rt, &weight, total);
3997
3998         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3999                 rt6_upper_bound_set(iter, &weight, total);
4000 }
4001
/* Recompute the multipath hash upper bounds for @rt's ECMP group after
 * a nexthop changed state.  No-op for non-multipath routes and for
 * groups already marked for flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
4025
4026 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4027 {
4028         const struct arg_netdev_event *arg = p_arg;
4029         struct net *net = dev_net(arg->dev);
4030
4031         if (rt != net->ipv6.fib6_null_entry &&
4032             rt->fib6_nh.fib_nh_dev == arg->dev) {
4033                 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4034                 fib6_update_sernum_upto_root(net, rt);
4035                 rt6_multipath_rebalance(rt);
4036         }
4037
4038         return 0;
4039 }
4040
/* Clear @nh_flags on every route using @dev after it came (back) up.
 * When RTNH_F_DEAD is being cleared and the carrier is up, clear
 * RTNH_F_LINKDOWN as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
4055
4056 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4057                                    const struct net_device *dev)
4058 {
4059         struct fib6_info *iter;
4060
4061         if (rt->fib6_nh.fib_nh_dev == dev)
4062                 return true;
4063         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4064                 if (iter->fib6_nh.fib_nh_dev == dev)
4065                         return true;
4066
4067         return false;
4068 }
4069
4070 static void rt6_multipath_flush(struct fib6_info *rt)
4071 {
4072         struct fib6_info *iter;
4073
4074         rt->should_flush = 1;
4075         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076                 iter->should_flush = 1;
4077 }
4078
4079 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4080                                              const struct net_device *down_dev)
4081 {
4082         struct fib6_info *iter;
4083         unsigned int dead = 0;
4084
4085         if (rt->fib6_nh.fib_nh_dev == down_dev ||
4086             rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4087                 dead++;
4088         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4089                 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4090                     iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4091                         dead++;
4092
4093         return dead;
4094 }
4095
4096 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4097                                        const struct net_device *dev,
4098                                        unsigned char nh_flags)
4099 {
4100         struct fib6_info *iter;
4101
4102         if (rt->fib6_nh.fib_nh_dev == dev)
4103                 rt->fib6_nh.fib_nh_flags |= nh_flags;
4104         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4105                 if (iter->fib6_nh.fib_nh_dev == dev)
4106                         iter->fib6_nh.fib_nh_flags |= nh_flags;
4107 }
4108
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for NETDEV_UNREGISTER/DOWN/CHANGE events.
 * Returning -1 asks the walker to delete the route; -2 is a multipath
 * result interpreted by the tree walker (see fib6_clean_node()); 0
 * keeps the route.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away: delete every route using it */
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				/* every nexthop dead: flush the whole group */
				rt6_multipath_flush(rt);
				return -1;
			}
			/* otherwise only mark this device's nexthops dead
			 * and rebalance the remaining ones
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* carrier change: linkdown-flag unicast routes on this dev */
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4152
/* Propagate a NETDEV_* down-type event for @dev to the whole FIB,
 * suppressing netlink notifications when the skip_notify_on_dev_down
 * sysctl is enabled.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}
4168
/* Tear down IPv6 routing state for @dev: sync the FIB for the event,
 * flush uncached routes referencing the device and drop its neighbour
 * entries from the ND table.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4175
/* Argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU value */
};
4180
/* fib6_clean_all() callback: update the RTAX_MTU metric and the cached
 * exception routes' PMTU for routes over @arg->dev after an
 * administrative MTU change.  Always returns 0 (never deletes routes).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* raise on increase; on decrease only follow the device if
		 * the route was tracking the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4215
4216 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4217 {
4218         struct rt6_mtu_change_arg arg = {
4219                 .dev = dev,
4220                 .mtu = mtu,
4221         };
4222
4223         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4224 }
4225
/* Netlink attribute policy for IPv6 RTM_NEWROUTE/RTM_DELROUTE/
 * RTM_GETROUTE requests (validated by nlmsg_parse below).
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]		= { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]		= { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4245
/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.  Returns 0 on success or a negative errno.
 * NOTE: on success cfg may hold pointers into the message's attribute
 * payload (fc_mx, fc_mp, fc_encap), so @skb must outlive @cfg.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	/* Start from a zeroed config seeded with the rtmsg header fields;
	 * the attributes below override individual members.
	 */
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	/* IPv6 nexthops are expressed with RTA_GATEWAY only. */
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* Only the prefix-length's worth of bytes is required. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* Points into the nlmsg; see function header. */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the 8-bit rtm_table seeded above. */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* Unknown router preferences fall back to MEDIUM. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* Only finite timeouts arm RTF_EXPIRES; an "infinite"
		 * value leaves the route permanent.
		 */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4376
/* One pending nexthop parsed from an RTA_MULTIPATH request; queued on a
 * local list by ip6_route_info_append() in ip6_route_multipath_add().
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the config */
	struct list_head next;		/* link on the rt6_nh_list */
};
4382
4383 static int ip6_route_info_append(struct net *net,
4384                                  struct list_head *rt6_nh_list,
4385                                  struct fib6_info *rt,
4386                                  struct fib6_config *r_cfg)
4387 {
4388         struct rt6_nh *nh;
4389         int err = -EEXIST;
4390
4391         list_for_each_entry(nh, rt6_nh_list, next) {
4392                 /* check if fib6_info already exists */
4393                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4394                         return err;
4395         }
4396
4397         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4398         if (!nh)
4399                 return -ENOMEM;
4400         nh->fib6_info = rt;
4401         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4402         list_add_tail(&nh->next, rt6_nh_list);
4403
4404         return 0;
4405 }
4406
4407 static void ip6_route_mpath_notify(struct fib6_info *rt,
4408                                    struct fib6_info *rt_last,
4409                                    struct nl_info *info,
4410                                    __u16 nlflags)
4411 {
4412         /* if this is an APPEND route, then rt points to the first route
4413          * inserted and rt_last points to last route inserted. Userspace
4414          * wants a consistent dump of the route which starts at the first
4415          * nexthop. Since sibling routes are always added at the end of
4416          * the list, find the first sibling of the last route appended
4417          */
4418         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4419                 rt = list_first_entry(&rt_last->fib6_siblings,
4420                                       struct fib6_info,
4421                                       fib6_siblings);
4422         }
4423
4424         if (rt)
4425                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4426 }
4427
/* Install a route with multiple nexthops (RTA_MULTIPATH).
 * Phase 1 builds one fib6_info per nexthop entry on a local list,
 * phase 2 inserts them one by one, and on failure phase 3 deletes the
 * routes that were already inserted so the request does not apply
 * partially.  Userspace is notified once with the full route rather
 * than per nexthop.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* Each nexthop starts from the shared config, overridden by
		 * its own ifindex, gateway and encap attributes.
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		/* On success the list owns the reference taken by
		 * ip6_route_info_create().
		 */
		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		/* Drop our reference; on success the FIB tree holds its own,
		 * so the pointer stays valid for the notification below.
		 */
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4575
4576 static int ip6_route_multipath_del(struct fib6_config *cfg,
4577                                    struct netlink_ext_ack *extack)
4578 {
4579         struct fib6_config r_cfg;
4580         struct rtnexthop *rtnh;
4581         int remaining;
4582         int attrlen;
4583         int err = 1, last_err = 0;
4584
4585         remaining = cfg->fc_mp_len;
4586         rtnh = (struct rtnexthop *)cfg->fc_mp;
4587
4588         /* Parse a Multipath Entry */
4589         while (rtnh_ok(rtnh, remaining)) {
4590                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4591                 if (rtnh->rtnh_ifindex)
4592                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4593
4594                 attrlen = rtnh_attrlen(rtnh);
4595                 if (attrlen > 0) {
4596                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4597
4598                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4599                         if (nla) {
4600                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4601                                 r_cfg.fc_flags |= RTF_GATEWAY;
4602                         }
4603                 }
4604                 err = ip6_route_del(&r_cfg, extack);
4605                 if (err)
4606                         last_err = err;
4607
4608                 rtnh = rtnh_next(rtnh, &remaining);
4609         }
4610
4611         return last_err;
4612 }
4613
4614 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4615                               struct netlink_ext_ack *extack)
4616 {
4617         struct fib6_config cfg;
4618         int err;
4619
4620         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4621         if (err < 0)
4622                 return err;
4623
4624         if (cfg.fc_mp)
4625                 return ip6_route_multipath_del(&cfg, extack);
4626         else {
4627                 cfg.fc_delete_all_nh = 1;
4628                 return ip6_route_del(&cfg, extack);
4629         }
4630 }
4631
4632 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4633                               struct netlink_ext_ack *extack)
4634 {
4635         struct fib6_config cfg;
4636         int err;
4637
4638         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4639         if (err < 0)
4640                 return err;
4641
4642         if (cfg.fc_metric == 0)
4643                 cfg.fc_metric = IP6_RT_PRIO_USER;
4644
4645         if (cfg.fc_mp)
4646                 return ip6_route_multipath_add(&cfg, extack);
4647         else
4648                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4649 }
4650
/* Worst-case netlink message size needed to dump @rt with rt6_fill_node(),
 * including RTA_MULTIPATH nexthop space scaled by the sibling count.
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		/* Per-nexthop space inside RTA_MULTIPATH. */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
	       + nexthop_len;
}
4680
/* Fill one RTM_NEWROUTE netlink message for @rt, optionally using the
 * cached clone passed as @dst.  @dest/@src, when non-NULL, are the
 * addresses of a specific lookup and force host (/128) prefix lengths
 * in the dump.  Returns 0 on success or -EMSGSIZE when @skb is full.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Prefer the cached clone's keys and flags when one was supplied. */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* rtm_table is a legacy 8-bit field: large ids are reported as
	 * RT_TABLE_COMPAT there and in full via the RTA_TABLE attribute.
	 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast input routes are resolved by the mroute code;
		 * 0 means the reply was queued there and we are done.
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		unsigned char nh_flags = 0;

		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4838
4839 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4840                                const struct net_device *dev)
4841 {
4842         if (f6i->fib6_nh.fib_nh_dev == dev)
4843                 return true;
4844
4845         if (f6i->fib6_nsiblings) {
4846                 struct fib6_info *sibling, *next_sibling;
4847
4848                 list_for_each_entry_safe(sibling, next_sibling,
4849                                          &f6i->fib6_siblings, fib6_siblings) {
4850                         if (sibling->fib6_nh.fib_nh_dev == dev)
4851                                 return true;
4852                 }
4853         }
4854
4855         return false;
4856 }
4857
/* Callback for the RTM_GETROUTE dump walk: emit @rt into the dump skb
 * unless it is filtered out.  Returns 1 to skip a route that does not
 * match the filter (the walk continues), 0 for the null entry, or the
 * result of rt6_fill_node() — a negative value presumably tells the
 * caller the skb is full; confirm against fib6_dump_node().
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		/* Skip routes not matching the requested type, device or
		 * protocol; matched dumps are flagged as filtered.
		 */
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}
4886
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 * Sockets without strict checking get the legacy permissive parse;
 * strict ones must use zeroed header fields, host (/128) prefix
 * lengths, and only the attributes a route get supports.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* Legacy path: permissive parse, no further checks. */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	/* RTM_F_FIB_MATCH is the only flag a get request may carry. */
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* Addresses are only valid together with their prefix lengths. */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* Reject any attribute outside the get-route whitelist. */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4953
4954 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4955                               struct netlink_ext_ack *extack)
4956 {
4957         struct net *net = sock_net(in_skb->sk);
4958         struct nlattr *tb[RTA_MAX+1];
4959         int err, iif = 0, oif = 0;
4960         struct fib6_info *from;
4961         struct dst_entry *dst;
4962         struct rt6_info *rt;
4963         struct sk_buff *skb;
4964         struct rtmsg *rtm;
4965         struct flowi6 fl6 = {};
4966         bool fibmatch;
4967
4968         err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4969         if (err < 0)
4970                 goto errout;
4971
4972         err = -EINVAL;
4973         rtm = nlmsg_data(nlh);
4974         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4975         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4976
4977         if (tb[RTA_SRC]) {
4978                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4979                         goto errout;
4980
4981                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4982         }
4983
4984         if (tb[RTA_DST]) {
4985                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4986                         goto errout;
4987
4988                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4989         }
4990
4991         if (tb[RTA_IIF])
4992                 iif = nla_get_u32(tb[RTA_IIF]);
4993
4994         if (tb[RTA_OIF])
4995                 oif = nla_get_u32(tb[RTA_OIF]);
4996
4997         if (tb[RTA_MARK])
4998                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4999
5000         if (tb[RTA_UID])
5001                 fl6.flowi6_uid = make_kuid(current_user_ns(),
5002                                            nla_get_u32(tb[RTA_UID]));
5003         else
5004                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5005
5006         if (tb[RTA_SPORT])
5007                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5008
5009         if (tb[RTA_DPORT])
5010                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5011
5012         if (tb[RTA_IP_PROTO]) {
5013                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5014                                                   &fl6.flowi6_proto, AF_INET6,
5015                                                   extack);
5016                 if (err)
5017                         goto errout;
5018         }
5019
5020         if (iif) {
5021                 struct net_device *dev;
5022                 int flags = 0;
5023
5024                 rcu_read_lock();
5025
5026                 dev = dev_get_by_index_rcu(net, iif);
5027                 if (!dev) {
5028                         rcu_read_unlock();
5029                         err = -ENODEV;
5030                         goto errout;
5031                 }
5032
5033                 fl6.flowi6_iif = iif;
5034
5035                 if (!ipv6_addr_any(&fl6.saddr))
5036                         flags |= RT6_LOOKUP_F_HAS_SADDR;
5037
5038                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5039
5040                 rcu_read_unlock();
5041         } else {
5042                 fl6.flowi6_oif = oif;
5043
5044                 dst = ip6_route_output(net, NULL, &fl6);
5045         }
5046
5047
5048         rt = container_of(dst, struct rt6_info, dst);
5049         if (rt->dst.error) {
5050                 err = rt->dst.error;
5051                 ip6_rt_put(rt);
5052                 goto errout;
5053         }
5054
5055         if (rt == net->ipv6.ip6_null_entry) {
5056                 err = rt->dst.error;
5057                 ip6_rt_put(rt);
5058                 goto errout;
5059         }
5060
5061         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5062         if (!skb) {
5063                 ip6_rt_put(rt);
5064                 err = -ENOBUFS;
5065                 goto errout;
5066         }
5067
5068         skb_dst_set(skb, &rt->dst);
5069
5070         rcu_read_lock();
5071         from = rcu_dereference(rt->from);
5072         if (from) {
5073                 if (fibmatch)
5074                         err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5075                                             iif, RTM_NEWROUTE,
5076                                             NETLINK_CB(in_skb).portid,
5077                                             nlh->nlmsg_seq, 0);
5078                 else
5079                         err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5080                                             &fl6.saddr, iif, RTM_NEWROUTE,
5081                                             NETLINK_CB(in_skb).portid,
5082                                             nlh->nlmsg_seq, 0);
5083         } else {
5084                 err = -ENETUNREACH;
5085         }
5086         rcu_read_unlock();
5087
5088         if (err < 0) {
5089                 kfree_skb(skb);
5090                 goto errout;
5091         }
5092
5093         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5094 errout:
5095         return err;
5096 }
5097
/* Broadcast an rtnetlink notification (RTM_NEWROUTE/RTM_DELROUTE) for @rt
 * to RTNLGRP_IPV6_ROUTE listeners.  Callable from any context: gfp_any()
 * selects the allocation mode based on the current context.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	/* Echo the requester's netlink sequence number when the change was
	 * triggered by a userspace request; kernel-originated changes use 0.
	 */
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	/* Tell listeners that an event was lost so they can resync. */
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
5128
/* Netdevice notifier: the per-netns special routes (null entry, and with
 * CONFIG_IPV6_MULTIPLE_TABLES also prohibit/blackhole) are anchored on the
 * loopback device.  Wire them up when loopback registers and drop their
 * inet6_dev references when it unregisters.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* Only the loopback device is of interest here. */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		/* Each in6_dev_get() takes a reference dropped on UNREGISTER. */
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5162
5163 /*
5164  *      /proc
5165  */
5166
5167 #ifdef CONFIG_PROC_FS
5168 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5169 {
5170         struct net *net = (struct net *)seq->private;
5171         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5172                    net->ipv6.rt6_stats->fib_nodes,
5173                    net->ipv6.rt6_stats->fib_route_nodes,
5174                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5175                    net->ipv6.rt6_stats->fib_rt_entries,
5176                    net->ipv6.rt6_stats->fib_rt_cache,
5177                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5178                    net->ipv6.rt6_stats->fib_discarded_routes);
5179
5180         return 0;
5181 }
5182 #endif  /* CONFIG_PROC_FS */
5183
5184 #ifdef CONFIG_SYSCTL
5185
/* sysctl handler for net.ipv6.route.flush (write-only, mode 0200).
 *
 * Writing a value triggers an immediate fib6 garbage-collection run.
 * NOTE: @delay is sampled *before* proc_dointvec() stores the newly
 * written value, so this run uses the previously configured flush_delay
 * (pre-existing semantics, kept as-is).
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	/* The owning netns was stashed in extra1 by ipv6_route_sysctl_init(). */
	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	/* delay <= 0: flush immediately and expire all routes. */
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
5205
/* Bounds for the boolean skip_notify_on_dev_down knob (minmax handler). */
static int zero;
static int one = 1;

/* Template for the per-netns net.ipv6.route.* sysctl table.
 *
 * NOTE: entry order is part of the contract - ipv6_route_sysctl_init()
 * rewrites the .data pointers by index (table[0]..table[10]).  Keep the
 * two in sync when adding or reordering entries.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_minmax,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5291
/* Clone ipv6_route_table_template for a new network namespace and repoint
 * every .data field at that namespace's own storage.  Returns the table
 * (owned by the caller, who registers it and later kfree()s it) or NULL
 * on allocation failure.
 *
 * The numeric indices below must match the entry order in
 * ipv6_route_table_template[].
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* handler recovers the netns from here */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;	/* hides only "flush" */
	}

	return table;
}
5321 #endif
5322
/* Per-netns routing setup: dst ops, dst-entry accounting, the special
 * fib6/rt6 entries (null entry; with CONFIG_IPV6_MULTIPLE_TABLES also
 * prohibit and blackhole) and the sysctl defaults.
 *
 * Returns 0 or -ENOMEM.  On failure everything allocated so far is
 * unwound through the goto chain at the bottom - each label frees what
 * was allocated before its goto site, in reverse order.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* The special entries are per-netns copies of global templates. */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default sysctl values; exposed via ipv6_route_table_template[]. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5398
/* Per-netns teardown: free the special route entries allocated in
 * ip6_route_net_init() and release the dst-entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5409
5410 static int __net_init ip6_route_net_init_late(struct net *net)
5411 {
5412 #ifdef CONFIG_PROC_FS
5413         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5414                         sizeof(struct ipv6_route_iter));
5415         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5416                         rt6_stats_seq_show, NULL);
5417 #endif
5418         return 0;
5419 }
5420
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5428
/* Core per-netns routing state; registered early in ip6_route_init(). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5433
5434 static int __net_init ipv6_inetpeer_init(struct net *net)
5435 {
5436         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5437
5438         if (!bp)
5439                 return -ENOMEM;
5440         inet_peer_base_init(bp);
5441         net->ipv6.peers = bp;
5442         return 0;
5443 }
5444
5445 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5446 {
5447         struct inet_peer_base *bp = net->ipv6.peers;
5448
5449         net->ipv6.peers = NULL;
5450         inetpeer_invalidate_tree(bp);
5451         kfree(bp);
5452 }
5453
/* Per-netns inet_peer storage setup/teardown. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

/* Late-stage per-netns init (proc entries); registered after the core
 * route/inetpeer pernet ops in ip6_route_init().
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

/* Netdevice notifier; priority below addrconf's so addrconf runs first. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5468
/* Attach init_net's special route entries to the loopback device.
 * The loopback device registers before ip6_route_init() runs, so the
 * NETDEV_REGISTER notifier never fires for it in init_net; take the
 * device and inet6_dev references by hand here instead.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5484
/* Module init for the IPv6 routing subsystem.
 *
 * Order matters: slab cache -> blackhole dst accounting -> pernet
 * subsystems -> fib6/xfrm6/fib6-rules -> late pernet ops -> rtnetlink
 * handlers -> netdevice notifier -> per-cpu uncached lists.  The error
 * labels at the bottom unwind in exactly the reverse order.
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the same slab as regular rt6_info. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers; a failure of any unregisters them all. */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: reverse order of the setup above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5576
/* Module teardown: undo ip6_route_init() in strict reverse order. */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}