]> Git Repo - linux.git/blob - net/ipv6/route.c
net: hns3: Add handling of GRO Pkts not fully RX'ed in NAPI poll
[linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <[email protected]>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 static int ip6_rt_type_to_error(u8 fib6_type);
74
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79
/*
 * Result of checking the neighbour behind a route's next hop.
 * rt6_check_neigh() produces these values; rt6_score_route() passes the
 * negative ones through to its caller unchanged.
 */
80 enum rt6_nud_state {
        /* find_match() skips the route entirely */
81         RT6_NUD_FAIL_HARD = -3,
        /* neighbour is NUD_FAILED; only produced with CONFIG_IPV6_ROUTER_PREF */
82         RT6_NUD_FAIL_PROBE = -2,
        /* no neighbour entry (without CONFIG_IPV6_ROUTER_PREF); find_match()
         * scores the route 0 and requests round-robin
         */
83         RT6_NUD_FAIL_DO_RR = -1,
        /* next hop is usable */
84         RT6_NUD_SUCCEED = 1
85 };
86
/* Forward declarations of the dst_ops callbacks wired into
 * ip6_dst_ops_template / ip6_dst_blackhole_ops below.
 */
87 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int      ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(struct dst_ops *ops);
95
/* Packet handlers used as .input/.output of the reject-route templates,
 * followed by the remaining dst_ops callbacks and internal helpers that are
 * referenced before their definitions.
 */
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int              ip6_pkt_prohibit(struct sk_buff *skb);
99 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void             ip6_link_failure(struct sk_buff *skb);
101 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102                                            struct sk_buff *skb, u32 mtu);
103 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104                                         struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108                          struct fib6_info *rt, struct dst_entry *dst,
109                          struct in6_addr *dest, struct in6_addr *src,
110                          int iif, int type, u32 portid, u32 seq,
111                          unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113                                            struct in6_addr *daddr,
114                                            struct in6_addr *saddr);
115
/* Route Information option helpers; rt6_route_rcv() uses both. */
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118                                            const struct in6_addr *prefix, int prefixlen,
119                                            const struct in6_addr *gwaddr,
120                                            struct net_device *dev,
121                                            unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123                                            const struct in6_addr *prefix, int prefixlen,
124                                            const struct in6_addr *gwaddr,
125                                            struct net_device *dev);
126 #endif
127
/*
 * List of rt6_info entries linked through rt6i_uncached, guarded by @lock.
 * One instance exists per CPU (rt6_uncached_list below).
 */
128 struct uncached_list {
129         spinlock_t              lock;
130         struct list_head        head;
131 };
132
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138
139         rt->rt6i_uncached_list = ul;
140
141         spin_lock_bh(&ul->lock);
142         list_add_tail(&rt->rt6i_uncached, &ul->head);
143         spin_unlock_bh(&ul->lock);
144 }
145
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148         if (!list_empty(&rt->rt6i_uncached)) {
149                 struct uncached_list *ul = rt->rt6i_uncached_list;
150                 struct net *net = dev_net(rt->dst.dev);
151
152                 spin_lock_bh(&ul->lock);
153                 list_del(&rt->rt6i_uncached);
154                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155                 spin_unlock_bh(&ul->lock);
156         }
157 }
158
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161         struct net_device *loopback_dev = net->loopback_dev;
162         int cpu;
163
164         if (dev == loopback_dev)
165                 return;
166
167         for_each_possible_cpu(cpu) {
168                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169                 struct rt6_info *rt;
170
171                 spin_lock_bh(&ul->lock);
172                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173                         struct inet6_dev *rt_idev = rt->rt6i_idev;
174                         struct net_device *rt_dev = rt->dst.dev;
175
176                         if (rt_idev->dev == dev) {
177                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
178                                 in6_dev_put(rt_idev);
179                         }
180
181                         if (rt_dev == dev) {
182                                 rt->dst.dev = loopback_dev;
183                                 dev_hold(rt->dst.dev);
184                                 dev_put(rt_dev);
185                         }
186                 }
187                 spin_unlock_bh(&ul->lock);
188         }
189 }
190
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192                                              struct sk_buff *skb,
193                                              const void *daddr)
194 {
195         if (!ipv6_addr_any(p))
196                 return (const void *) p;
197         else if (skb)
198                 return &ipv6_hdr(skb)->daddr;
199         return daddr;
200 }
201
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203                                    struct net_device *dev,
204                                    struct sk_buff *skb,
205                                    const void *daddr)
206 {
207         struct neighbour *n;
208
209         daddr = choose_neigh_daddr(gw, skb, daddr);
210         n = __ipv6_neigh_lookup(dev, daddr);
211         if (n)
212                 return n;
213         return neigh_create(&nd_tbl, daddr, dev);
214 }
215
216 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
217                                               struct sk_buff *skb,
218                                               const void *daddr)
219 {
220         const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
221
222         return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 }
224
225 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
226 {
227         struct net_device *dev = dst->dev;
228         struct rt6_info *rt = (struct rt6_info *)dst;
229
230         daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231         if (!daddr)
232                 return;
233         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
234                 return;
235         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
236                 return;
237         __ipv6_confirm_neigh(dev, daddr);
238 }
239
/*
 * dst_ops for regular IPv6 routes. Named a template; ip6_dst_alloc() below
 * allocates against the per-namespace net->ipv6.ip6_dst_ops, which is
 * presumably initialised from this (the copy is not in this part of the file).
 */
240 static struct dst_ops ip6_dst_ops_template = {
241         .family                 =       AF_INET6,
242         .gc                     =       ip6_dst_gc,
243         .gc_thresh              =       1024,
244         .check                  =       ip6_dst_check,
245         .default_advmss         =       ip6_default_advmss,
246         .mtu                    =       ip6_mtu,
247         .cow_metrics            =       dst_cow_metrics_generic,
248         .destroy                =       ip6_dst_destroy,
249         .ifdown                 =       ip6_dst_ifdown,
250         .negative_advice        =       ip6_negative_advice,
251         .link_failure           =       ip6_link_failure,
252         .update_pmtu            =       ip6_rt_update_pmtu,
253         .redirect               =       rt6_do_redirect,
254         .local_out              =       __ip6_local_out,
255         .neigh_lookup           =       ip6_dst_neigh_lookup,
256         .confirm_neigh          =       ip6_confirm_neigh,
257 };
258
259 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260 {
261         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
262
263         return mtu ? : dst->dev->mtu;
264 }
265
266 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
267                                          struct sk_buff *skb, u32 mtu)
268 {
269 }
270
/* .redirect hook of ip6_dst_blackhole_ops: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
}
275
/*
 * dst_ops for blackhole dsts: PMTU updates and redirects go to the no-op
 * handlers defined above, and the MTU comes from ip6_blackhole_mtu().
 */
276 static struct dst_ops ip6_dst_blackhole_ops = {
277         .family                 =       AF_INET6,
278         .destroy                =       ip6_dst_destroy,
279         .check                  =       ip6_dst_check,
280         .mtu                    =       ip6_blackhole_mtu,
281         .default_advmss         =       ip6_default_advmss,
282         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
283         .redirect               =       ip6_rt_blackhole_redirect,
284         .cow_metrics            =       dst_cow_metrics_generic,
285         .neigh_lookup           =       ip6_dst_neigh_lookup,
286 };
287
/* All-zero metrics array; the one designated initializer only spells out
 * the RTAX_HOPLIMIT slot explicitly.
 */
288 static const u32 ip6_template_metrics[RTAX_MAX] = {
289         [RTAX_HOPLIMIT - 1] = 0,
290 };
291
/*
 * Template for a reject-type fib6_info: unreachable, no next hop, largest
 * possible metric, sharing dst_default_metrics.
 * NOTE(review): presumably the source of net->ipv6.fib6_null_entry, which the
 * lookup helpers in this file return when nothing matches - confirm at the
 * namespace-init code.
 */
292 static const struct fib6_info fib6_null_entry_template = {
293         .fib6_flags     = (RTF_REJECT | RTF_NONEXTHOP),
294         .fib6_protocol  = RTPROT_KERNEL,
295         .fib6_metric    = ~(u32)0,
296         .fib6_ref       = ATOMIC_INIT(1),
297         .fib6_type      = RTN_UNREACHABLE,
298         .fib6_metrics   = (struct dst_metrics *)&dst_default_metrics,
299 };
300
/*
 * Template rt6_info for the "null" reject route: error is -ENETUNREACH and
 * both input and output go to the ip6_pkt_discard handlers.
 */
301 static const struct rt6_info ip6_null_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -ENETUNREACH,
307                 .input          = ip6_pkt_discard,
308                 .output         = ip6_pkt_discard_out,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311 };
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/*
 * Template rt6_info for the "prohibit" reject route: error is -EACCES and
 * both input and output go to the ip6_pkt_prohibit handlers.
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
315 static const struct rt6_info ip6_prohibit_entry_template = {
316         .dst = {
317                 .__refcnt       = ATOMIC_INIT(1),
318                 .__use          = 1,
319                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
320                 .error          = -EACCES,
321                 .input          = ip6_pkt_prohibit,
322                 .output         = ip6_pkt_prohibit_out,
323         },
324         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
325 };
326
/*
 * Template rt6_info for the "blackhole" reject route: error is -EINVAL and
 * packets go to the generic dst_discard handlers.
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
327 static const struct rt6_info ip6_blk_hole_entry_template = {
328         .dst = {
329                 .__refcnt       = ATOMIC_INIT(1),
330                 .__use          = 1,
331                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
332                 .error          = -EINVAL,
333                 .input          = dst_discard,
334                 .output         = dst_discard_out,
335         },
336         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
337 };
338
339 #endif
340
341 static void rt6_info_init(struct rt6_info *rt)
342 {
343         struct dst_entry *dst = &rt->dst;
344
345         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
346         INIT_LIST_HEAD(&rt->rt6i_uncached);
347 }
348
349 /* allocate dst with ip6_dst_ops */
350 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
351                                int flags)
352 {
353         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
354                                         1, DST_OBSOLETE_FORCE_CHK, flags);
355
356         if (rt) {
357                 rt6_info_init(rt);
358                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
359         }
360
361         return rt;
362 }
363 EXPORT_SYMBOL(ip6_dst_alloc);
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct fib6_info *from;
369         struct inet6_dev *idev;
370
371         ip_dst_metrics_put(dst);
372         rt6_uncached_list_del(rt);
373
374         idev = rt->rt6i_idev;
375         if (idev) {
376                 rt->rt6i_idev = NULL;
377                 in6_dev_put(idev);
378         }
379
380         rcu_read_lock();
381         from = rcu_dereference(rt->from);
382         rcu_assign_pointer(rt->from, NULL);
383         fib6_info_release(from);
384         rcu_read_unlock();
385 }
386
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388                            int how)
389 {
390         struct rt6_info *rt = (struct rt6_info *)dst;
391         struct inet6_dev *idev = rt->rt6i_idev;
392         struct net_device *loopback_dev =
393                 dev_net(dev)->loopback_dev;
394
395         if (idev && idev->dev != loopback_dev) {
396                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397                 if (loopback_idev) {
398                         rt->rt6i_idev = loopback_idev;
399                         in6_dev_put(idev);
400                 }
401         }
402 }
403
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406         if (rt->rt6i_flags & RTF_EXPIRES)
407                 return time_after(jiffies, rt->dst.expires);
408         else
409                 return false;
410 }
411
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414         struct fib6_info *from;
415
416         from = rcu_dereference(rt->from);
417
418         if (rt->rt6i_flags & RTF_EXPIRES) {
419                 if (time_after(jiffies, rt->dst.expires))
420                         return true;
421         } else if (from) {
422                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423                         fib6_check_expired(from);
424         }
425         return false;
426 }
427
428 struct fib6_info *fib6_multipath_select(const struct net *net,
429                                         struct fib6_info *match,
430                                         struct flowi6 *fl6, int oif,
431                                         const struct sk_buff *skb,
432                                         int strict)
433 {
434         struct fib6_info *sibling, *next_sibling;
435
436         /* We might have already computed the hash for ICMPv6 errors. In such
437          * case it will always be non-zero. Otherwise now is the time to do it.
438          */
439         if (!fl6->mp_hash)
440                 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
441
442         if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
443                 return match;
444
445         list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446                                  fib6_siblings) {
447                 int nh_upper_bound;
448
449                 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
450                 if (fl6->mp_hash > nh_upper_bound)
451                         continue;
452                 if (rt6_score_route(sibling, oif, strict) < 0)
453                         break;
454                 match = sibling;
455                 break;
456         }
457
458         return match;
459 }
460
461 /*
462  *      Route lookup. rcu_read_lock() should be held.
463  */
464
465 static inline struct fib6_info *rt6_device_match(struct net *net,
466                                                  struct fib6_info *rt,
467                                                     const struct in6_addr *saddr,
468                                                     int oif,
469                                                     int flags)
470 {
471         struct fib6_info *sprt;
472
473         if (!oif && ipv6_addr_any(saddr) &&
474             !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
475                 return rt;
476
477         for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
478                 const struct net_device *dev = sprt->fib6_nh.nh_dev;
479
480                 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
481                         continue;
482
483                 if (oif) {
484                         if (dev->ifindex == oif)
485                                 return sprt;
486                 } else {
487                         if (ipv6_chk_addr(net, saddr, dev,
488                                           flags & RT6_LOOKUP_F_IFACE))
489                                 return sprt;
490                 }
491         }
492
493         if (oif && flags & RT6_LOOKUP_F_IFACE)
494                 return net->ipv6.fib6_null_entry;
495
496         return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
497 }
498
499 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router-probe request: rt6_probe() fills one in and schedules it;
 * rt6_probe_deferred() sends the solicitation to @target on @dev, then drops
 * the device reference and frees the structure.
 */
500 struct __rt6_probe_work {
501         struct work_struct work;
502         struct in6_addr target;
503         struct net_device *dev;
504 };
505
506 static void rt6_probe_deferred(struct work_struct *w)
507 {
508         struct in6_addr mcaddr;
509         struct __rt6_probe_work *work =
510                 container_of(w, struct __rt6_probe_work, work);
511
512         addrconf_addr_solict_mult(&work->target, &mcaddr);
513         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
514         dev_put(work->dev);
515         kfree(work);
516 }
517
518 static void rt6_probe(struct fib6_info *rt)
519 {
520         struct __rt6_probe_work *work = NULL;
521         const struct in6_addr *nh_gw;
522         struct neighbour *neigh;
523         struct net_device *dev;
524         struct inet6_dev *idev;
525
526         /*
527          * Okay, this does not seem to be appropriate
528          * for now, however, we need to check if it
529          * is really so; aka Router Reachability Probing.
530          *
531          * Router Reachability Probe MUST be rate-limited
532          * to no more than one per minute.
533          */
534         if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
535                 return;
536
537         nh_gw = &rt->fib6_nh.nh_gw;
538         dev = rt->fib6_nh.nh_dev;
539         rcu_read_lock_bh();
540         idev = __in6_dev_get(dev);
541         neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
542         if (neigh) {
543                 if (neigh->nud_state & NUD_VALID)
544                         goto out;
545
546                 write_lock(&neigh->lock);
547                 if (!(neigh->nud_state & NUD_VALID) &&
548                     time_after(jiffies,
549                                neigh->updated + idev->cnf.rtr_probe_interval)) {
550                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
551                         if (work)
552                                 __neigh_set_probe_once(neigh);
553                 }
554                 write_unlock(&neigh->lock);
555         } else if (time_after(jiffies, rt->last_probe +
556                                        idev->cnf.rtr_probe_interval)) {
557                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
558         }
559
560         if (work) {
561                 rt->last_probe = jiffies;
562                 INIT_WORK(&work->work, rt6_probe_deferred);
563                 work->target = *nh_gw;
564                 dev_hold(dev);
565                 work->dev = dev;
566                 schedule_work(&work->work);
567         }
568
569 out:
570         rcu_read_unlock_bh();
571 }
572 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
576 #endif
577
578 /*
579  * Default Router Selection (RFC 2461 6.3.6)
580  */
581 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
582 {
583         const struct net_device *dev = rt->fib6_nh.nh_dev;
584
585         if (!oif || dev->ifindex == oif)
586                 return 2;
587         return 0;
588 }
589
590 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
591 {
592         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
593         struct neighbour *neigh;
594
595         if (rt->fib6_flags & RTF_NONEXTHOP ||
596             !(rt->fib6_flags & RTF_GATEWAY))
597                 return RT6_NUD_SUCCEED;
598
599         rcu_read_lock_bh();
600         neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
601                                           &rt->fib6_nh.nh_gw);
602         if (neigh) {
603                 read_lock(&neigh->lock);
604                 if (neigh->nud_state & NUD_VALID)
605                         ret = RT6_NUD_SUCCEED;
606 #ifdef CONFIG_IPV6_ROUTER_PREF
607                 else if (!(neigh->nud_state & NUD_FAILED))
608                         ret = RT6_NUD_SUCCEED;
609                 else
610                         ret = RT6_NUD_FAIL_PROBE;
611 #endif
612                 read_unlock(&neigh->lock);
613         } else {
614                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
615                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
616         }
617         rcu_read_unlock_bh();
618
619         return ret;
620 }
621
622 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
623 {
624         int m;
625
626         m = rt6_check_dev(rt, oif);
627         if (!m && (strict & RT6_LOOKUP_F_IFACE))
628                 return RT6_NUD_FAIL_HARD;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
631 #endif
632         if (strict & RT6_LOOKUP_F_REACHABLE) {
633                 int n = rt6_check_neigh(rt);
634                 if (n < 0)
635                         return n;
636         }
637         return m;
638 }
639
640 /* called with rc_read_lock held */
641 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
642 {
643         const struct net_device *dev = fib6_info_nh_dev(f6i);
644         bool rc = false;
645
646         if (dev) {
647                 const struct inet6_dev *idev = __in6_dev_get(dev);
648
649                 rc = !!idev->cnf.ignore_routes_with_linkdown;
650         }
651
652         return rc;
653 }
654
655 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
656                                    int *mpri, struct fib6_info *match,
657                                    bool *do_rr)
658 {
659         int m;
660         bool match_do_rr = false;
661
662         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
663                 goto out;
664
665         if (fib6_ignore_linkdown(rt) &&
666             rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
667             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
668                 goto out;
669
670         if (fib6_check_expired(rt))
671                 goto out;
672
673         m = rt6_score_route(rt, oif, strict);
674         if (m == RT6_NUD_FAIL_DO_RR) {
675                 match_do_rr = true;
676                 m = 0; /* lowest valid score */
677         } else if (m == RT6_NUD_FAIL_HARD) {
678                 goto out;
679         }
680
681         if (strict & RT6_LOOKUP_F_REACHABLE)
682                 rt6_probe(rt);
683
684         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
685         if (m > *mpri) {
686                 *do_rr = match_do_rr;
687                 *mpri = m;
688                 match = rt;
689         }
690 out:
691         return match;
692 }
693
694 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
695                                      struct fib6_info *leaf,
696                                      struct fib6_info *rr_head,
697                                      u32 metric, int oif, int strict,
698                                      bool *do_rr)
699 {
700         struct fib6_info *rt, *match, *cont;
701         int mpri = -1;
702
703         match = NULL;
704         cont = NULL;
705         for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
706                 if (rt->fib6_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         for (rt = leaf; rt && rt != rr_head;
715              rt = rcu_dereference(rt->fib6_next)) {
716                 if (rt->fib6_metric != metric) {
717                         cont = rt;
718                         break;
719                 }
720
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722         }
723
724         if (match || !cont)
725                 return match;
726
727         for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
728                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
729
730         return match;
731 }
732
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734                                    int oif, int strict)
735 {
736         struct fib6_info *leaf = rcu_dereference(fn->leaf);
737         struct fib6_info *match, *rt0;
738         bool do_rr = false;
739         int key_plen;
740
741         if (!leaf || leaf == net->ipv6.fib6_null_entry)
742                 return net->ipv6.fib6_null_entry;
743
744         rt0 = rcu_dereference(fn->rr_ptr);
745         if (!rt0)
746                 rt0 = leaf;
747
748         /* Double check to make sure fn is not an intermediate node
749          * and fn->leaf does not points to its child's leaf
750          * (This might happen if all routes under fn are deleted from
751          * the tree and fib6_repair_tree() is called on the node.)
752          */
753         key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755         if (rt0->fib6_src.plen)
756                 key_plen = rt0->fib6_src.plen;
757 #endif
758         if (fn->fn_bit != key_plen)
759                 return net->ipv6.fib6_null_entry;
760
761         match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762                              &do_rr);
763
764         if (do_rr) {
765                 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766
767                 /* no entries matched; do round-robin */
768                 if (!next || next->fib6_metric != rt0->fib6_metric)
769                         next = leaf;
770
771                 if (next != rt0) {
772                         spin_lock_bh(&leaf->fib6_table->tb6_lock);
773                         /* make sure next is not being deleted from the tree */
774                         if (next->fib6_node)
775                                 rcu_assign_pointer(fn->rr_ptr, next);
776                         spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777                 }
778         }
779
780         return match ? match : net->ipv6.fib6_null_entry;
781 }
782
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785         return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
786 }
787
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790                   const struct in6_addr *gwaddr)
791 {
792         struct net *net = dev_net(dev);
793         struct route_info *rinfo = (struct route_info *) opt;
794         struct in6_addr prefix_buf, *prefix;
795         unsigned int pref;
796         unsigned long lifetime;
797         struct fib6_info *rt;
798
799         if (len < sizeof(struct route_info)) {
800                 return -EINVAL;
801         }
802
803         /* Sanity check for prefix_len and length */
804         if (rinfo->length > 3) {
805                 return -EINVAL;
806         } else if (rinfo->prefix_len > 128) {
807                 return -EINVAL;
808         } else if (rinfo->prefix_len > 64) {
809                 if (rinfo->length < 2) {
810                         return -EINVAL;
811                 }
812         } else if (rinfo->prefix_len > 0) {
813                 if (rinfo->length < 1) {
814                         return -EINVAL;
815                 }
816         }
817
818         pref = rinfo->route_pref;
819         if (pref == ICMPV6_ROUTER_PREF_INVALID)
820                 return -EINVAL;
821
822         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823
824         if (rinfo->length == 3)
825                 prefix = (struct in6_addr *)rinfo->prefix;
826         else {
827                 /* this function is safe */
828                 ipv6_addr_prefix(&prefix_buf,
829                                  (struct in6_addr *)rinfo->prefix,
830                                  rinfo->prefix_len);
831                 prefix = &prefix_buf;
832         }
833
834         if (rinfo->prefix_len == 0)
835                 rt = rt6_get_dflt_router(net, gwaddr, dev);
836         else
837                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838                                         gwaddr, dev);
839
840         if (rt && !lifetime) {
841                 ip6_del_rt(net, rt);
842                 rt = NULL;
843         }
844
845         if (!rt && lifetime)
846                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847                                         dev, pref);
848         else if (rt)
849                 rt->fib6_flags = RTF_ROUTEINFO |
850                                  (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851
852         if (rt) {
853                 if (!addrconf_finite_timeout(lifetime))
854                         fib6_clean_expires(rt);
855                 else
856                         fib6_set_expires(rt, jiffies + HZ * lifetime);
857
858                 fib6_info_release(rt);
859         }
860         return 0;
861 }
862 #endif
863
864 /*
865  *      Misc support functions
866  */
867
868 /* called with rcu_lock held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871         struct net_device *dev = rt->fib6_nh.nh_dev;
872
873         if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874                 /* for copies of local routes, dst->dev needs to be the
875                  * device if it is a master device, the master device if
876                  * device is enslaved, and the loopback as the default
877                  */
878                 if (netif_is_l3_slave(dev) &&
879                     !rt6_need_strict(&rt->fib6_dst.addr))
880                         dev = l3mdev_master_dev_rcu(dev);
881                 else if (!netif_is_l3_master(dev))
882                         dev = dev_net(dev)->loopback_dev;
883                 /* last case is netif_is_l3_master(dev) is true in which
884                  * case we want dev returned to be dev
885                  */
886         }
887
888         return dev;
889 }
890
891 static const int fib6_prop[RTN_MAX + 1] = {
892         [RTN_UNSPEC]    = 0,
893         [RTN_UNICAST]   = 0,
894         [RTN_LOCAL]     = 0,
895         [RTN_BROADCAST] = 0,
896         [RTN_ANYCAST]   = 0,
897         [RTN_MULTICAST] = 0,
898         [RTN_BLACKHOLE] = -EINVAL,
899         [RTN_UNREACHABLE] = -EHOSTUNREACH,
900         [RTN_PROHIBIT]  = -EACCES,
901         [RTN_THROW]     = -EAGAIN,
902         [RTN_NAT]       = -EINVAL,
903         [RTN_XRESOLVE]  = -EINVAL,
904 };
905
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908         return fib6_prop[fib6_type];
909 }
910
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913         unsigned short flags = 0;
914
915         if (rt->dst_nocount)
916                 flags |= DST_NOCOUNT;
917         if (rt->dst_nopolicy)
918                 flags |= DST_NOPOLICY;
919         if (rt->dst_host)
920                 flags |= DST_HOST;
921
922         return flags;
923 }
924
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927         rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928
929         switch (ort->fib6_type) {
930         case RTN_BLACKHOLE:
931                 rt->dst.output = dst_discard_out;
932                 rt->dst.input = dst_discard;
933                 break;
934         case RTN_PROHIBIT:
935                 rt->dst.output = ip6_pkt_prohibit_out;
936                 rt->dst.input = ip6_pkt_prohibit;
937                 break;
938         case RTN_THROW:
939         case RTN_UNREACHABLE:
940         default:
941                 rt->dst.output = ip6_pkt_discard_out;
942                 rt->dst.input = ip6_pkt_discard;
943                 break;
944         }
945 }
946
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949         if (ort->fib6_flags & RTF_REJECT) {
950                 ip6_rt_init_dst_reject(rt, ort);
951                 return;
952         }
953
954         rt->dst.error = 0;
955         rt->dst.output = ip6_output;
956
957         if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958                 rt->dst.input = ip6_input;
959         } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960                 rt->dst.input = ip6_mc_input;
961         } else {
962                 rt->dst.input = ip6_forward;
963         }
964
965         if (ort->fib6_nh.nh_lwtstate) {
966                 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
967                 lwtunnel_set_redirect(&rt->dst);
968         }
969
970         rt->dst.lastuse = jiffies;
971 }
972
973 /* Caller must already hold reference to @from */
974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
975 {
976         rt->rt6i_flags &= ~RTF_EXPIRES;
977         rcu_assign_pointer(rt->from, from);
978         ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
979 }
980
981 /* Caller must already hold reference to @ort */
982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 {
984         struct net_device *dev = fib6_info_nh_dev(ort);
985
986         ip6_rt_init_dst(rt, ort);
987
988         rt->rt6i_dst = ort->fib6_dst;
989         rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
990         rt->rt6i_gateway = ort->fib6_nh.nh_gw;
991         rt->rt6i_flags = ort->fib6_flags;
992         rt6_set_from(rt, ort);
993 #ifdef CONFIG_IPV6_SUBTREES
994         rt->rt6i_src = ort->fib6_src;
995 #endif
996 }
997
998 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
999                                         struct in6_addr *saddr)
1000 {
1001         struct fib6_node *pn, *sn;
1002         while (1) {
1003                 if (fn->fn_flags & RTN_TL_ROOT)
1004                         return NULL;
1005                 pn = rcu_dereference(fn->parent);
1006                 sn = FIB6_SUBTREE(pn);
1007                 if (sn && sn != fn)
1008                         fn = fib6_node_lookup(sn, NULL, saddr);
1009                 else
1010                         fn = pn;
1011                 if (fn->fn_flags & RTN_RTINFO)
1012                         return fn;
1013         }
1014 }
1015
1016 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1017                           bool null_fallback)
1018 {
1019         struct rt6_info *rt = *prt;
1020
1021         if (dst_hold_safe(&rt->dst))
1022                 return true;
1023         if (null_fallback) {
1024                 rt = net->ipv6.ip6_null_entry;
1025                 dst_hold(&rt->dst);
1026         } else {
1027                 rt = NULL;
1028         }
1029         *prt = rt;
1030         return false;
1031 }
1032
1033 /* called with rcu_lock held */
1034 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1035 {
1036         unsigned short flags = fib6_info_dst_flags(rt);
1037         struct net_device *dev = rt->fib6_nh.nh_dev;
1038         struct rt6_info *nrt;
1039
1040         if (!fib6_info_hold_safe(rt))
1041                 return NULL;
1042
1043         nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1044         if (nrt)
1045                 ip6_rt_copy_init(nrt, rt);
1046         else
1047                 fib6_info_release(rt);
1048
1049         return nrt;
1050 }
1051
1052 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1053                                              struct fib6_table *table,
1054                                              struct flowi6 *fl6,
1055                                              const struct sk_buff *skb,
1056                                              int flags)
1057 {
1058         struct fib6_info *f6i;
1059         struct fib6_node *fn;
1060         struct rt6_info *rt;
1061
1062         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1063                 flags &= ~RT6_LOOKUP_F_IFACE;
1064
1065         rcu_read_lock();
1066         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1067 restart:
1068         f6i = rcu_dereference(fn->leaf);
1069         if (!f6i) {
1070                 f6i = net->ipv6.fib6_null_entry;
1071         } else {
1072                 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1073                                       fl6->flowi6_oif, flags);
1074                 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1075                         f6i = fib6_multipath_select(net, f6i, fl6,
1076                                                     fl6->flowi6_oif, skb,
1077                                                     flags);
1078         }
1079         if (f6i == net->ipv6.fib6_null_entry) {
1080                 fn = fib6_backtrack(fn, &fl6->saddr);
1081                 if (fn)
1082                         goto restart;
1083         }
1084
1085         trace_fib6_table_lookup(net, f6i, table, fl6);
1086
1087         /* Search through exception table */
1088         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1089         if (rt) {
1090                 if (ip6_hold_safe(net, &rt, true))
1091                         dst_use_noref(&rt->dst, jiffies);
1092         } else if (f6i == net->ipv6.fib6_null_entry) {
1093                 rt = net->ipv6.ip6_null_entry;
1094                 dst_hold(&rt->dst);
1095         } else {
1096                 rt = ip6_create_rt_rcu(f6i);
1097                 if (!rt) {
1098                         rt = net->ipv6.ip6_null_entry;
1099                         dst_hold(&rt->dst);
1100                 }
1101         }
1102
1103         rcu_read_unlock();
1104
1105         return rt;
1106 }
1107
1108 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1109                                    const struct sk_buff *skb, int flags)
1110 {
1111         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1112 }
1113 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1114
1115 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1116                             const struct in6_addr *saddr, int oif,
1117                             const struct sk_buff *skb, int strict)
1118 {
1119         struct flowi6 fl6 = {
1120                 .flowi6_oif = oif,
1121                 .daddr = *daddr,
1122         };
1123         struct dst_entry *dst;
1124         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1125
1126         if (saddr) {
1127                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1128                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1129         }
1130
1131         dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1132         if (dst->error == 0)
1133                 return (struct rt6_info *) dst;
1134
1135         dst_release(dst);
1136
1137         return NULL;
1138 }
1139 EXPORT_SYMBOL(rt6_lookup);
1140
1141 /* ip6_ins_rt is called with FREE table->tb6_lock.
1142  * It takes new route entry, the addition fails by any reason the
1143  * route is released.
1144  * Caller must hold dst before calling it.
1145  */
1146
1147 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1148                         struct netlink_ext_ack *extack)
1149 {
1150         int err;
1151         struct fib6_table *table;
1152
1153         table = rt->fib6_table;
1154         spin_lock_bh(&table->tb6_lock);
1155         err = fib6_add(&table->tb6_root, rt, info, extack);
1156         spin_unlock_bh(&table->tb6_lock);
1157
1158         return err;
1159 }
1160
1161 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1162 {
1163         struct nl_info info = { .nl_net = net, };
1164
1165         return __ip6_ins_rt(rt, &info, NULL);
1166 }
1167
1168 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1169                                            const struct in6_addr *daddr,
1170                                            const struct in6_addr *saddr)
1171 {
1172         struct net_device *dev;
1173         struct rt6_info *rt;
1174
1175         /*
1176          *      Clone the route.
1177          */
1178
1179         if (!fib6_info_hold_safe(ort))
1180                 return NULL;
1181
1182         dev = ip6_rt_get_dev_rcu(ort);
1183         rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1184         if (!rt) {
1185                 fib6_info_release(ort);
1186                 return NULL;
1187         }
1188
1189         ip6_rt_copy_init(rt, ort);
1190         rt->rt6i_flags |= RTF_CACHE;
1191         rt->dst.flags |= DST_HOST;
1192         rt->rt6i_dst.addr = *daddr;
1193         rt->rt6i_dst.plen = 128;
1194
1195         if (!rt6_is_gw_or_nonexthop(ort)) {
1196                 if (ort->fib6_dst.plen != 128 &&
1197                     ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1198                         rt->rt6i_flags |= RTF_ANYCAST;
1199 #ifdef CONFIG_IPV6_SUBTREES
1200                 if (rt->rt6i_src.plen && saddr) {
1201                         rt->rt6i_src.addr = *saddr;
1202                         rt->rt6i_src.plen = 128;
1203                 }
1204 #endif
1205         }
1206
1207         return rt;
1208 }
1209
1210 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1211 {
1212         unsigned short flags = fib6_info_dst_flags(rt);
1213         struct net_device *dev;
1214         struct rt6_info *pcpu_rt;
1215
1216         if (!fib6_info_hold_safe(rt))
1217                 return NULL;
1218
1219         rcu_read_lock();
1220         dev = ip6_rt_get_dev_rcu(rt);
1221         pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1222         rcu_read_unlock();
1223         if (!pcpu_rt) {
1224                 fib6_info_release(rt);
1225                 return NULL;
1226         }
1227         ip6_rt_copy_init(pcpu_rt, rt);
1228         pcpu_rt->rt6i_flags |= RTF_PCPU;
1229         return pcpu_rt;
1230 }
1231
1232 /* It should be called with rcu_read_lock() acquired */
1233 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1234 {
1235         struct rt6_info *pcpu_rt, **p;
1236
1237         p = this_cpu_ptr(rt->rt6i_pcpu);
1238         pcpu_rt = *p;
1239
1240         if (pcpu_rt)
1241                 ip6_hold_safe(NULL, &pcpu_rt, false);
1242
1243         return pcpu_rt;
1244 }
1245
1246 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1247                                             struct fib6_info *rt)
1248 {
1249         struct rt6_info *pcpu_rt, *prev, **p;
1250
1251         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1252         if (!pcpu_rt) {
1253                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1254                 return net->ipv6.ip6_null_entry;
1255         }
1256
1257         dst_hold(&pcpu_rt->dst);
1258         p = this_cpu_ptr(rt->rt6i_pcpu);
1259         prev = cmpxchg(p, NULL, pcpu_rt);
1260         BUG_ON(prev);
1261
1262         return pcpu_rt;
1263 }
1264
/* Exception hash table implementation.
 *
 * rt6_exception_lock is the single spinlock taken by the functions below
 * that modify an exception table.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1268
1269 /* Remove rt6_ex from hash table and free the memory
1270  * Caller must hold rt6_exception_lock
1271  */
1272 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1273                                  struct rt6_exception *rt6_ex)
1274 {
1275         struct net *net;
1276
1277         if (!bucket || !rt6_ex)
1278                 return;
1279
1280         net = dev_net(rt6_ex->rt6i->dst.dev);
1281         hlist_del_rcu(&rt6_ex->hlist);
1282         dst_release(&rt6_ex->rt6i->dst);
1283         kfree_rcu(rt6_ex, rcu);
1284         WARN_ON_ONCE(!bucket->depth);
1285         bucket->depth--;
1286         net->ipv6.rt6_stats->fib_rt_cache--;
1287 }
1288
1289 /* Remove oldest rt6_ex in bucket and free the memory
1290  * Caller must hold rt6_exception_lock
1291  */
1292 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1293 {
1294         struct rt6_exception *rt6_ex, *oldest = NULL;
1295
1296         if (!bucket)
1297                 return;
1298
1299         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1300                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1301                         oldest = rt6_ex;
1302         }
1303         rt6_remove_exception(bucket, oldest);
1304 }
1305
1306 static u32 rt6_exception_hash(const struct in6_addr *dst,
1307                               const struct in6_addr *src)
1308 {
1309         static u32 seed __read_mostly;
1310         u32 val;
1311
1312         net_get_random_once(&seed, sizeof(seed));
1313         val = jhash(dst, sizeof(*dst), seed);
1314
1315 #ifdef CONFIG_IPV6_SUBTREES
1316         if (src)
1317                 val = jhash(src, sizeof(*src), val);
1318 #endif
1319         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1320 }
1321
1322 /* Helper function to find the cached rt in the hash table
1323  * and update bucket pointer to point to the bucket for this
1324  * (daddr, saddr) pair
1325  * Caller must hold rt6_exception_lock
1326  */
1327 static struct rt6_exception *
1328 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1329                               const struct in6_addr *daddr,
1330                               const struct in6_addr *saddr)
1331 {
1332         struct rt6_exception *rt6_ex;
1333         u32 hval;
1334
1335         if (!(*bucket) || !daddr)
1336                 return NULL;
1337
1338         hval = rt6_exception_hash(daddr, saddr);
1339         *bucket += hval;
1340
1341         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1342                 struct rt6_info *rt6 = rt6_ex->rt6i;
1343                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1344
1345 #ifdef CONFIG_IPV6_SUBTREES
1346                 if (matched && saddr)
1347                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1348 #endif
1349                 if (matched)
1350                         return rt6_ex;
1351         }
1352         return NULL;
1353 }
1354
1355 /* Helper function to find the cached rt in the hash table
1356  * and update bucket pointer to point to the bucket for this
1357  * (daddr, saddr) pair
1358  * Caller must hold rcu_read_lock()
1359  */
1360 static struct rt6_exception *
1361 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1362                          const struct in6_addr *daddr,
1363                          const struct in6_addr *saddr)
1364 {
1365         struct rt6_exception *rt6_ex;
1366         u32 hval;
1367
1368         WARN_ON_ONCE(!rcu_read_lock_held());
1369
1370         if (!(*bucket) || !daddr)
1371                 return NULL;
1372
1373         hval = rt6_exception_hash(daddr, saddr);
1374         *bucket += hval;
1375
1376         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1377                 struct rt6_info *rt6 = rt6_ex->rt6i;
1378                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1379
1380 #ifdef CONFIG_IPV6_SUBTREES
1381                 if (matched && saddr)
1382                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1383 #endif
1384                 if (matched)
1385                         return rt6_ex;
1386         }
1387         return NULL;
1388 }
1389
1390 static unsigned int fib6_mtu(const struct fib6_info *rt)
1391 {
1392         unsigned int mtu;
1393
1394         if (rt->fib6_pmtu) {
1395                 mtu = rt->fib6_pmtu;
1396         } else {
1397                 struct net_device *dev = fib6_info_nh_dev(rt);
1398                 struct inet6_dev *idev;
1399
1400                 rcu_read_lock();
1401                 idev = __in6_dev_get(dev);
1402                 mtu = idev->cnf.mtu6;
1403                 rcu_read_unlock();
1404         }
1405
1406         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1407
1408         return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1409 }
1410
1411 static int rt6_insert_exception(struct rt6_info *nrt,
1412                                 struct fib6_info *ort)
1413 {
1414         struct net *net = dev_net(nrt->dst.dev);
1415         struct rt6_exception_bucket *bucket;
1416         struct in6_addr *src_key = NULL;
1417         struct rt6_exception *rt6_ex;
1418         int err = 0;
1419
1420         spin_lock_bh(&rt6_exception_lock);
1421
1422         if (ort->exception_bucket_flushed) {
1423                 err = -EINVAL;
1424                 goto out;
1425         }
1426
1427         bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1428                                         lockdep_is_held(&rt6_exception_lock));
1429         if (!bucket) {
1430                 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1431                                  GFP_ATOMIC);
1432                 if (!bucket) {
1433                         err = -ENOMEM;
1434                         goto out;
1435                 }
1436                 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1437         }
1438
1439 #ifdef CONFIG_IPV6_SUBTREES
1440         /* rt6i_src.plen != 0 indicates ort is in subtree
1441          * and exception table is indexed by a hash of
1442          * both rt6i_dst and rt6i_src.
1443          * Otherwise, the exception table is indexed by
1444          * a hash of only rt6i_dst.
1445          */
1446         if (ort->fib6_src.plen)
1447                 src_key = &nrt->rt6i_src.addr;
1448 #endif
1449         /* rt6_mtu_change() might lower mtu on ort.
1450          * Only insert this exception route if its mtu
1451          * is less than ort's mtu value.
1452          */
1453         if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1454                 err = -EINVAL;
1455                 goto out;
1456         }
1457
1458         rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1459                                                src_key);
1460         if (rt6_ex)
1461                 rt6_remove_exception(bucket, rt6_ex);
1462
1463         rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1464         if (!rt6_ex) {
1465                 err = -ENOMEM;
1466                 goto out;
1467         }
1468         rt6_ex->rt6i = nrt;
1469         rt6_ex->stamp = jiffies;
1470         hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1471         bucket->depth++;
1472         net->ipv6.rt6_stats->fib_rt_cache++;
1473
1474         if (bucket->depth > FIB6_MAX_DEPTH)
1475                 rt6_exception_remove_oldest(bucket);
1476
1477 out:
1478         spin_unlock_bh(&rt6_exception_lock);
1479
1480         /* Update fn->fn_sernum to invalidate all cached dst */
1481         if (!err) {
1482                 spin_lock_bh(&ort->fib6_table->tb6_lock);
1483                 fib6_update_sernum(net, ort);
1484                 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1485                 fib6_force_start_gc(net);
1486         }
1487
1488         return err;
1489 }
1490
1491 void rt6_flush_exceptions(struct fib6_info *rt)
1492 {
1493         struct rt6_exception_bucket *bucket;
1494         struct rt6_exception *rt6_ex;
1495         struct hlist_node *tmp;
1496         int i;
1497
1498         spin_lock_bh(&rt6_exception_lock);
1499         /* Prevent rt6_insert_exception() to recreate the bucket list */
1500         rt->exception_bucket_flushed = 1;
1501
1502         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1503                                     lockdep_is_held(&rt6_exception_lock));
1504         if (!bucket)
1505                 goto out;
1506
1507         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1508                 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1509                         rt6_remove_exception(bucket, rt6_ex);
1510                 WARN_ON_ONCE(bucket->depth);
1511                 bucket++;
1512         }
1513
1514 out:
1515         spin_unlock_bh(&rt6_exception_lock);
1516 }
1517
1518 /* Find cached rt in the hash table inside passed in rt
1519  * Caller has to hold rcu_read_lock()
1520  */
1521 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1522                                            struct in6_addr *daddr,
1523                                            struct in6_addr *saddr)
1524 {
1525         struct rt6_exception_bucket *bucket;
1526         struct in6_addr *src_key = NULL;
1527         struct rt6_exception *rt6_ex;
1528         struct rt6_info *res = NULL;
1529
1530         bucket = rcu_dereference(rt->rt6i_exception_bucket);
1531
1532 #ifdef CONFIG_IPV6_SUBTREES
1533         /* rt6i_src.plen != 0 indicates rt is in subtree
1534          * and exception table is indexed by a hash of
1535          * both rt6i_dst and rt6i_src.
1536          * Otherwise, the exception table is indexed by
1537          * a hash of only rt6i_dst.
1538          */
1539         if (rt->fib6_src.plen)
1540                 src_key = saddr;
1541 #endif
1542         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1543
1544         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1545                 res = rt6_ex->rt6i;
1546
1547         return res;
1548 }
1549
1550 /* Remove the passed in cached rt from the hash table that contains it */
1551 static int rt6_remove_exception_rt(struct rt6_info *rt)
1552 {
1553         struct rt6_exception_bucket *bucket;
1554         struct in6_addr *src_key = NULL;
1555         struct rt6_exception *rt6_ex;
1556         struct fib6_info *from;
1557         int err;
1558
1559         from = rcu_dereference(rt->from);
1560         if (!from ||
1561             !(rt->rt6i_flags & RTF_CACHE))
1562                 return -EINVAL;
1563
1564         if (!rcu_access_pointer(from->rt6i_exception_bucket))
1565                 return -ENOENT;
1566
1567         spin_lock_bh(&rt6_exception_lock);
1568         bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1569                                     lockdep_is_held(&rt6_exception_lock));
1570 #ifdef CONFIG_IPV6_SUBTREES
1571         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1572          * and exception table is indexed by a hash of
1573          * both rt6i_dst and rt6i_src.
1574          * Otherwise, the exception table is indexed by
1575          * a hash of only rt6i_dst.
1576          */
1577         if (from->fib6_src.plen)
1578                 src_key = &rt->rt6i_src.addr;
1579 #endif
1580         rt6_ex = __rt6_find_exception_spinlock(&bucket,
1581                                                &rt->rt6i_dst.addr,
1582                                                src_key);
1583         if (rt6_ex) {
1584                 rt6_remove_exception(bucket, rt6_ex);
1585                 err = 0;
1586         } else {
1587                 err = -ENOENT;
1588         }
1589
1590         spin_unlock_bh(&rt6_exception_lock);
1591         return err;
1592 }
1593
1594 /* Find rt6_ex which contains the passed in rt cache and
1595  * refresh its stamp
1596  */
1597 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1598 {
1599         struct rt6_exception_bucket *bucket;
1600         struct fib6_info *from = rt->from;
1601         struct in6_addr *src_key = NULL;
1602         struct rt6_exception *rt6_ex;
1603
1604         if (!from ||
1605             !(rt->rt6i_flags & RTF_CACHE))
1606                 return;
1607
1608         rcu_read_lock();
1609         bucket = rcu_dereference(from->rt6i_exception_bucket);
1610
1611 #ifdef CONFIG_IPV6_SUBTREES
1612         /* rt6i_src.plen != 0 indicates 'from' is in subtree
1613          * and exception table is indexed by a hash of
1614          * both rt6i_dst and rt6i_src.
1615          * Otherwise, the exception table is indexed by
1616          * a hash of only rt6i_dst.
1617          */
1618         if (from->fib6_src.plen)
1619                 src_key = &rt->rt6i_src.addr;
1620 #endif
1621         rt6_ex = __rt6_find_exception_rcu(&bucket,
1622                                           &rt->rt6i_dst.addr,
1623                                           src_key);
1624         if (rt6_ex)
1625                 rt6_ex->stamp = jiffies;
1626
1627         rcu_read_unlock();
1628 }
1629
1630 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1631                                          struct rt6_info *rt, int mtu)
1632 {
1633         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1634          * lowest MTU in the path: always allow updating the route PMTU to
1635          * reflect PMTU decreases.
1636          *
1637          * If the new MTU is higher, and the route PMTU is equal to the local
1638          * MTU, this means the old MTU is the lowest in the path, so allow
1639          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1640          * handle this.
1641          */
1642
1643         if (dst_mtu(&rt->dst) >= mtu)
1644                 return true;
1645
1646         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1647                 return true;
1648
1649         return false;
1650 }
1651
1652 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1653                                        struct fib6_info *rt, int mtu)
1654 {
1655         struct rt6_exception_bucket *bucket;
1656         struct rt6_exception *rt6_ex;
1657         int i;
1658
1659         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1660                                         lockdep_is_held(&rt6_exception_lock));
1661
1662         if (!bucket)
1663                 return;
1664
1665         for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1666                 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1667                         struct rt6_info *entry = rt6_ex->rt6i;
1668
1669                         /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1670                          * route), the metrics of its rt->from have already
1671                          * been updated.
1672                          */
1673                         if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1674                             rt6_mtu_change_route_allowed(idev, entry, mtu))
1675                                 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1676                 }
1677                 bucket++;
1678         }
1679 }
1680
1681 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1682
1683 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1684                                         struct in6_addr *gateway)
1685 {
1686         struct rt6_exception_bucket *bucket;
1687         struct rt6_exception *rt6_ex;
1688         struct hlist_node *tmp;
1689         int i;
1690
1691         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1692                 return;
1693
1694         spin_lock_bh(&rt6_exception_lock);
1695         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1696                                      lockdep_is_held(&rt6_exception_lock));
1697
1698         if (bucket) {
1699                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1700                         hlist_for_each_entry_safe(rt6_ex, tmp,
1701                                                   &bucket->chain, hlist) {
1702                                 struct rt6_info *entry = rt6_ex->rt6i;
1703
1704                                 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1705                                     RTF_CACHE_GATEWAY &&
1706                                     ipv6_addr_equal(gateway,
1707                                                     &entry->rt6i_gateway)) {
1708                                         rt6_remove_exception(bucket, rt6_ex);
1709                                 }
1710                         }
1711                         bucket++;
1712                 }
1713         }
1714
1715         spin_unlock_bh(&rt6_exception_lock);
1716 }
1717
1718 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1719                                       struct rt6_exception *rt6_ex,
1720                                       struct fib6_gc_args *gc_args,
1721                                       unsigned long now)
1722 {
1723         struct rt6_info *rt = rt6_ex->rt6i;
1724
1725         /* we are pruning and obsoleting aged-out and non gateway exceptions
1726          * even if others have still references to them, so that on next
1727          * dst_check() such references can be dropped.
1728          * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1729          * expired, independently from their aging, as per RFC 8201 section 4
1730          */
1731         if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1732                 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1733                         RT6_TRACE("aging clone %p\n", rt);
1734                         rt6_remove_exception(bucket, rt6_ex);
1735                         return;
1736                 }
1737         } else if (time_after(jiffies, rt->dst.expires)) {
1738                 RT6_TRACE("purging expired route %p\n", rt);
1739                 rt6_remove_exception(bucket, rt6_ex);
1740                 return;
1741         }
1742
1743         if (rt->rt6i_flags & RTF_GATEWAY) {
1744                 struct neighbour *neigh;
1745                 __u8 neigh_flags = 0;
1746
1747                 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1748                 if (neigh)
1749                         neigh_flags = neigh->flags;
1750
1751                 if (!(neigh_flags & NTF_ROUTER)) {
1752                         RT6_TRACE("purging route %p via non-router but gateway\n",
1753                                   rt);
1754                         rt6_remove_exception(bucket, rt6_ex);
1755                         return;
1756                 }
1757         }
1758
1759         gc_args->more++;
1760 }
1761
1762 void rt6_age_exceptions(struct fib6_info *rt,
1763                         struct fib6_gc_args *gc_args,
1764                         unsigned long now)
1765 {
1766         struct rt6_exception_bucket *bucket;
1767         struct rt6_exception *rt6_ex;
1768         struct hlist_node *tmp;
1769         int i;
1770
1771         if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1772                 return;
1773
1774         rcu_read_lock_bh();
1775         spin_lock(&rt6_exception_lock);
1776         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1777                                     lockdep_is_held(&rt6_exception_lock));
1778
1779         if (bucket) {
1780                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1781                         hlist_for_each_entry_safe(rt6_ex, tmp,
1782                                                   &bucket->chain, hlist) {
1783                                 rt6_age_examine_exception(bucket, rt6_ex,
1784                                                           gc_args, now);
1785                         }
1786                         bucket++;
1787                 }
1788         }
1789         spin_unlock(&rt6_exception_lock);
1790         rcu_read_unlock_bh();
1791 }
1792
1793 /* must be called with rcu lock held */
1794 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1795                                     int oif, struct flowi6 *fl6, int strict)
1796 {
1797         struct fib6_node *fn, *saved_fn;
1798         struct fib6_info *f6i;
1799
1800         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1801         saved_fn = fn;
1802
1803         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1804                 oif = 0;
1805
1806 redo_rt6_select:
1807         f6i = rt6_select(net, fn, oif, strict);
1808         if (f6i == net->ipv6.fib6_null_entry) {
1809                 fn = fib6_backtrack(fn, &fl6->saddr);
1810                 if (fn)
1811                         goto redo_rt6_select;
1812                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1813                         /* also consider unreachable route */
1814                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1815                         fn = saved_fn;
1816                         goto redo_rt6_select;
1817                 }
1818         }
1819
1820         trace_fib6_table_lookup(net, f6i, table, fl6);
1821
1822         return f6i;
1823 }
1824
1825 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1826                                int oif, struct flowi6 *fl6,
1827                                const struct sk_buff *skb, int flags)
1828 {
1829         struct fib6_info *f6i;
1830         struct rt6_info *rt;
1831         int strict = 0;
1832
1833         strict |= flags & RT6_LOOKUP_F_IFACE;
1834         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1835         if (net->ipv6.devconf_all->forwarding == 0)
1836                 strict |= RT6_LOOKUP_F_REACHABLE;
1837
1838         rcu_read_lock();
1839
1840         f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1841         if (f6i->fib6_nsiblings)
1842                 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1843
1844         if (f6i == net->ipv6.fib6_null_entry) {
1845                 rt = net->ipv6.ip6_null_entry;
1846                 rcu_read_unlock();
1847                 dst_hold(&rt->dst);
1848                 return rt;
1849         }
1850
1851         /*Search through exception table */
1852         rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1853         if (rt) {
1854                 if (ip6_hold_safe(net, &rt, true))
1855                         dst_use_noref(&rt->dst, jiffies);
1856
1857                 rcu_read_unlock();
1858                 return rt;
1859         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1860                             !(f6i->fib6_flags & RTF_GATEWAY))) {
1861                 /* Create a RTF_CACHE clone which will not be
1862                  * owned by the fib6 tree.  It is for the special case where
1863                  * the daddr in the skb during the neighbor look-up is different
1864                  * from the fl6->daddr used to look-up route here.
1865                  */
1866                 struct rt6_info *uncached_rt;
1867
1868                 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1869
1870                 rcu_read_unlock();
1871
1872                 if (uncached_rt) {
1873                         /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1874                          * No need for another dst_hold()
1875                          */
1876                         rt6_uncached_list_add(uncached_rt);
1877                         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1878                 } else {
1879                         uncached_rt = net->ipv6.ip6_null_entry;
1880                         dst_hold(&uncached_rt->dst);
1881                 }
1882
1883                 return uncached_rt;
1884         } else {
1885                 /* Get a percpu copy */
1886
1887                 struct rt6_info *pcpu_rt;
1888
1889                 local_bh_disable();
1890                 pcpu_rt = rt6_get_pcpu_route(f6i);
1891
1892                 if (!pcpu_rt)
1893                         pcpu_rt = rt6_make_pcpu_route(net, f6i);
1894
1895                 local_bh_enable();
1896                 rcu_read_unlock();
1897
1898                 return pcpu_rt;
1899         }
1900 }
1901 EXPORT_SYMBOL_GPL(ip6_pol_route);
1902
1903 static struct rt6_info *ip6_pol_route_input(struct net *net,
1904                                             struct fib6_table *table,
1905                                             struct flowi6 *fl6,
1906                                             const struct sk_buff *skb,
1907                                             int flags)
1908 {
1909         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1910 }
1911
1912 struct dst_entry *ip6_route_input_lookup(struct net *net,
1913                                          struct net_device *dev,
1914                                          struct flowi6 *fl6,
1915                                          const struct sk_buff *skb,
1916                                          int flags)
1917 {
1918         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1919                 flags |= RT6_LOOKUP_F_IFACE;
1920
1921         return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1922 }
1923 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1924
1925 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1926                                   struct flow_keys *keys,
1927                                   struct flow_keys *flkeys)
1928 {
1929         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1930         const struct ipv6hdr *key_iph = outer_iph;
1931         struct flow_keys *_flkeys = flkeys;
1932         const struct ipv6hdr *inner_iph;
1933         const struct icmp6hdr *icmph;
1934         struct ipv6hdr _inner_iph;
1935         struct icmp6hdr _icmph;
1936
1937         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1938                 goto out;
1939
1940         icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1941                                    sizeof(_icmph), &_icmph);
1942         if (!icmph)
1943                 goto out;
1944
1945         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1946             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1947             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1948             icmph->icmp6_type != ICMPV6_PARAMPROB)
1949                 goto out;
1950
1951         inner_iph = skb_header_pointer(skb,
1952                                        skb_transport_offset(skb) + sizeof(*icmph),
1953                                        sizeof(_inner_iph), &_inner_iph);
1954         if (!inner_iph)
1955                 goto out;
1956
1957         key_iph = inner_iph;
1958         _flkeys = NULL;
1959 out:
1960         if (_flkeys) {
1961                 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1962                 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1963                 keys->tags.flow_label = _flkeys->tags.flow_label;
1964                 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1965         } else {
1966                 keys->addrs.v6addrs.src = key_iph->saddr;
1967                 keys->addrs.v6addrs.dst = key_iph->daddr;
1968                 keys->tags.flow_label = ip6_flowlabel(key_iph);
1969                 keys->basic.ip_proto = key_iph->nexthdr;
1970         }
1971 }
1972
1973 /* if skb is set it will be used and fl6 can be NULL */
1974 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1975                        const struct sk_buff *skb, struct flow_keys *flkeys)
1976 {
1977         struct flow_keys hash_keys;
1978         u32 mhash;
1979
1980         switch (ip6_multipath_hash_policy(net)) {
1981         case 0:
1982                 memset(&hash_keys, 0, sizeof(hash_keys));
1983                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1984                 if (skb) {
1985                         ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1986                 } else {
1987                         hash_keys.addrs.v6addrs.src = fl6->saddr;
1988                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
1989                         hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
1990                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
1991                 }
1992                 break;
1993         case 1:
1994                 if (skb) {
1995                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1996                         struct flow_keys keys;
1997
1998                         /* short-circuit if we already have L4 hash present */
1999                         if (skb->l4_hash)
2000                                 return skb_get_hash_raw(skb) >> 1;
2001
2002                         memset(&hash_keys, 0, sizeof(hash_keys));
2003
2004                         if (!flkeys) {
2005                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
2006                                 flkeys = &keys;
2007                         }
2008                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2009                         hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2010                         hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2011                         hash_keys.ports.src = flkeys->ports.src;
2012                         hash_keys.ports.dst = flkeys->ports.dst;
2013                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2014                 } else {
2015                         memset(&hash_keys, 0, sizeof(hash_keys));
2016                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2017                         hash_keys.addrs.v6addrs.src = fl6->saddr;
2018                         hash_keys.addrs.v6addrs.dst = fl6->daddr;
2019                         hash_keys.ports.src = fl6->fl6_sport;
2020                         hash_keys.ports.dst = fl6->fl6_dport;
2021                         hash_keys.basic.ip_proto = fl6->flowi6_proto;
2022                 }
2023                 break;
2024         }
2025         mhash = flow_hash_from_keys(&hash_keys);
2026
2027         return mhash >> 1;
2028 }
2029
2030 void ip6_route_input(struct sk_buff *skb)
2031 {
2032         const struct ipv6hdr *iph = ipv6_hdr(skb);
2033         struct net *net = dev_net(skb->dev);
2034         int flags = RT6_LOOKUP_F_HAS_SADDR;
2035         struct ip_tunnel_info *tun_info;
2036         struct flowi6 fl6 = {
2037                 .flowi6_iif = skb->dev->ifindex,
2038                 .daddr = iph->daddr,
2039                 .saddr = iph->saddr,
2040                 .flowlabel = ip6_flowinfo(iph),
2041                 .flowi6_mark = skb->mark,
2042                 .flowi6_proto = iph->nexthdr,
2043         };
2044         struct flow_keys *flkeys = NULL, _flkeys;
2045
2046         tun_info = skb_tunnel_info(skb);
2047         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2048                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2049
2050         if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2051                 flkeys = &_flkeys;
2052
2053         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2054                 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2055         skb_dst_drop(skb);
2056         skb_dst_set(skb,
2057                     ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2058 }
2059
2060 static struct rt6_info *ip6_pol_route_output(struct net *net,
2061                                              struct fib6_table *table,
2062                                              struct flowi6 *fl6,
2063                                              const struct sk_buff *skb,
2064                                              int flags)
2065 {
2066         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2067 }
2068
2069 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2070                                          struct flowi6 *fl6, int flags)
2071 {
2072         bool any_src;
2073
2074         if (ipv6_addr_type(&fl6->daddr) &
2075             (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2076                 struct dst_entry *dst;
2077
2078                 dst = l3mdev_link_scope_lookup(net, fl6);
2079                 if (dst)
2080                         return dst;
2081         }
2082
2083         fl6->flowi6_iif = LOOPBACK_IFINDEX;
2084
2085         any_src = ipv6_addr_any(&fl6->saddr);
2086         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2087             (fl6->flowi6_oif && any_src))
2088                 flags |= RT6_LOOKUP_F_IFACE;
2089
2090         if (!any_src)
2091                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2092         else if (sk)
2093                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2094
2095         return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2096 }
2097 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2098
2099 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2100 {
2101         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2102         struct net_device *loopback_dev = net->loopback_dev;
2103         struct dst_entry *new = NULL;
2104
2105         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2106                        DST_OBSOLETE_DEAD, 0);
2107         if (rt) {
2108                 rt6_info_init(rt);
2109                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2110
2111                 new = &rt->dst;
2112                 new->__use = 1;
2113                 new->input = dst_discard;
2114                 new->output = dst_discard_out;
2115
2116                 dst_copy_metrics(new, &ort->dst);
2117
2118                 rt->rt6i_idev = in6_dev_get(loopback_dev);
2119                 rt->rt6i_gateway = ort->rt6i_gateway;
2120                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2121
2122                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2123 #ifdef CONFIG_IPV6_SUBTREES
2124                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2125 #endif
2126         }
2127
2128         dst_release(dst_orig);
2129         return new ? new : ERR_PTR(-ENOMEM);
2130 }
2131
2132 /*
2133  *      Destination cache support functions
2134  */
2135
2136 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2137 {
2138         u32 rt_cookie = 0;
2139
2140         if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2141                 return false;
2142
2143         if (fib6_check_expired(f6i))
2144                 return false;
2145
2146         return true;
2147 }
2148
2149 static struct dst_entry *rt6_check(struct rt6_info *rt,
2150                                    struct fib6_info *from,
2151                                    u32 cookie)
2152 {
2153         u32 rt_cookie = 0;
2154
2155         if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2156             rt_cookie != cookie)
2157                 return NULL;
2158
2159         if (rt6_check_expired(rt))
2160                 return NULL;
2161
2162         return &rt->dst;
2163 }
2164
2165 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2166                                             struct fib6_info *from,
2167                                             u32 cookie)
2168 {
2169         if (!__rt6_check_expired(rt) &&
2170             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2171             fib6_check(from, cookie))
2172                 return &rt->dst;
2173         else
2174                 return NULL;
2175 }
2176
2177 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2178 {
2179         struct dst_entry *dst_ret;
2180         struct fib6_info *from;
2181         struct rt6_info *rt;
2182
2183         rt = container_of(dst, struct rt6_info, dst);
2184
2185         rcu_read_lock();
2186
2187         /* All IPV6 dsts are created with ->obsolete set to the value
2188          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2189          * into this function always.
2190          */
2191
2192         from = rcu_dereference(rt->from);
2193
2194         if (from && (rt->rt6i_flags & RTF_PCPU ||
2195             unlikely(!list_empty(&rt->rt6i_uncached))))
2196                 dst_ret = rt6_dst_from_check(rt, from, cookie);
2197         else
2198                 dst_ret = rt6_check(rt, from, cookie);
2199
2200         rcu_read_unlock();
2201
2202         return dst_ret;
2203 }
2204
2205 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2206 {
2207         struct rt6_info *rt = (struct rt6_info *) dst;
2208
2209         if (rt) {
2210                 if (rt->rt6i_flags & RTF_CACHE) {
2211                         rcu_read_lock();
2212                         if (rt6_check_expired(rt)) {
2213                                 rt6_remove_exception_rt(rt);
2214                                 dst = NULL;
2215                         }
2216                         rcu_read_unlock();
2217                 } else {
2218                         dst_release(dst);
2219                         dst = NULL;
2220                 }
2221         }
2222         return dst;
2223 }
2224
2225 static void ip6_link_failure(struct sk_buff *skb)
2226 {
2227         struct rt6_info *rt;
2228
2229         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2230
2231         rt = (struct rt6_info *) skb_dst(skb);
2232         if (rt) {
2233                 rcu_read_lock();
2234                 if (rt->rt6i_flags & RTF_CACHE) {
2235                         if (dst_hold_safe(&rt->dst))
2236                                 rt6_remove_exception_rt(rt);
2237                 } else {
2238                         struct fib6_info *from;
2239                         struct fib6_node *fn;
2240
2241                         from = rcu_dereference(rt->from);
2242                         if (from) {
2243                                 fn = rcu_dereference(from->fib6_node);
2244                                 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2245                                         fn->fn_sernum = -1;
2246                         }
2247                 }
2248                 rcu_read_unlock();
2249         }
2250 }
2251
2252 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2253 {
2254         if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2255                 struct fib6_info *from;
2256
2257                 rcu_read_lock();
2258                 from = rcu_dereference(rt0->from);
2259                 if (from)
2260                         rt0->dst.expires = from->expires;
2261                 rcu_read_unlock();
2262         }
2263
2264         dst_set_expires(&rt0->dst, timeout);
2265         rt0->rt6i_flags |= RTF_EXPIRES;
2266 }
2267
2268 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2269 {
2270         struct net *net = dev_net(rt->dst.dev);
2271
2272         dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2273         rt->rt6i_flags |= RTF_MODIFIED;
2274         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2275 }
2276
2277 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2278 {
2279         bool from_set;
2280
2281         rcu_read_lock();
2282         from_set = !!rcu_dereference(rt->from);
2283         rcu_read_unlock();
2284
2285         return !(rt->rt6i_flags & RTF_CACHE) &&
2286                 (rt->rt6i_flags & RTF_PCPU || from_set);
2287 }
2288
2289 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2290                                  const struct ipv6hdr *iph, u32 mtu)
2291 {
2292         const struct in6_addr *daddr, *saddr;
2293         struct rt6_info *rt6 = (struct rt6_info *)dst;
2294
2295         if (dst_metric_locked(dst, RTAX_MTU))
2296                 return;
2297
2298         if (iph) {
2299                 daddr = &iph->daddr;
2300                 saddr = &iph->saddr;
2301         } else if (sk) {
2302                 daddr = &sk->sk_v6_daddr;
2303                 saddr = &inet6_sk(sk)->saddr;
2304         } else {
2305                 daddr = NULL;
2306                 saddr = NULL;
2307         }
2308         dst_confirm_neigh(dst, daddr);
2309         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2310         if (mtu >= dst_mtu(dst))
2311                 return;
2312
2313         if (!rt6_cache_allowed_for_pmtu(rt6)) {
2314                 rt6_do_update_pmtu(rt6, mtu);
2315                 /* update rt6_ex->stamp for cache */
2316                 if (rt6->rt6i_flags & RTF_CACHE)
2317                         rt6_update_exception_stamp_rt(rt6);
2318         } else if (daddr) {
2319                 struct fib6_info *from;
2320                 struct rt6_info *nrt6;
2321
2322                 rcu_read_lock();
2323                 from = rcu_dereference(rt6->from);
2324                 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2325                 if (nrt6) {
2326                         rt6_do_update_pmtu(nrt6, mtu);
2327                         if (rt6_insert_exception(nrt6, from))
2328                                 dst_release_immediate(&nrt6->dst);
2329                 }
2330                 rcu_read_unlock();
2331         }
2332 }
2333
2334 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2335                                struct sk_buff *skb, u32 mtu)
2336 {
2337         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2338 }
2339
2340 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2341                      int oif, u32 mark, kuid_t uid)
2342 {
2343         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2344         struct dst_entry *dst;
2345         struct flowi6 fl6 = {
2346                 .flowi6_oif = oif,
2347                 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2348                 .daddr = iph->daddr,
2349                 .saddr = iph->saddr,
2350                 .flowlabel = ip6_flowinfo(iph),
2351                 .flowi6_uid = uid,
2352         };
2353
2354         dst = ip6_route_output(net, NULL, &fl6);
2355         if (!dst->error)
2356                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2357         dst_release(dst);
2358 }
2359 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2360
2361 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2362 {
2363         struct dst_entry *dst;
2364
2365         ip6_update_pmtu(skb, sock_net(sk), mtu,
2366                         sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2367
2368         dst = __sk_dst_get(sk);
2369         if (!dst || !dst->obsolete ||
2370             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2371                 return;
2372
2373         bh_lock_sock(sk);
2374         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2375                 ip6_datagram_dst_update(sk, false);
2376         bh_unlock_sock(sk);
2377 }
2378 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2379
2380 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2381                            const struct flowi6 *fl6)
2382 {
2383 #ifdef CONFIG_IPV6_SUBTREES
2384         struct ipv6_pinfo *np = inet6_sk(sk);
2385 #endif
2386
2387         ip6_dst_store(sk, dst,
2388                       ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2389                       &sk->sk_v6_daddr : NULL,
2390 #ifdef CONFIG_IPV6_SUBTREES
2391                       ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2392                       &np->saddr :
2393 #endif
2394                       NULL);
2395 }
2396
2397 /* Handle redirects */
2398 struct ip6rd_flowi {
2399         struct flowi6 fl6;
2400         struct in6_addr gateway;
2401 };
2402
2403 static struct rt6_info *__ip6_route_redirect(struct net *net,
2404                                              struct fib6_table *table,
2405                                              struct flowi6 *fl6,
2406                                              const struct sk_buff *skb,
2407                                              int flags)
2408 {
2409         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2410         struct rt6_info *ret = NULL, *rt_cache;
2411         struct fib6_info *rt;
2412         struct fib6_node *fn;
2413
2414         /* Get the "current" route for this destination and
2415          * check if the redirect has come from appropriate router.
2416          *
2417          * RFC 4861 specifies that redirects should only be
2418          * accepted if they come from the nexthop to the target.
2419          * Due to the way the routes are chosen, this notion
2420          * is a bit fuzzy and one might need to check all possible
2421          * routes.
2422          */
2423
2424         rcu_read_lock();
2425         fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2426 restart:
2427         for_each_fib6_node_rt_rcu(fn) {
2428                 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2429                         continue;
2430                 if (fib6_check_expired(rt))
2431                         continue;
2432                 if (rt->fib6_flags & RTF_REJECT)
2433                         break;
2434                 if (!(rt->fib6_flags & RTF_GATEWAY))
2435                         continue;
2436                 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2437                         continue;
2438                 /* rt_cache's gateway might be different from its 'parent'
2439                  * in the case of an ip redirect.
2440                  * So we keep searching in the exception table if the gateway
2441                  * is different.
2442                  */
2443                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2444                         rt_cache = rt6_find_cached_rt(rt,
2445                                                       &fl6->daddr,
2446                                                       &fl6->saddr);
2447                         if (rt_cache &&
2448                             ipv6_addr_equal(&rdfl->gateway,
2449                                             &rt_cache->rt6i_gateway)) {
2450                                 ret = rt_cache;
2451                                 break;
2452                         }
2453                         continue;
2454                 }
2455                 break;
2456         }
2457
2458         if (!rt)
2459                 rt = net->ipv6.fib6_null_entry;
2460         else if (rt->fib6_flags & RTF_REJECT) {
2461                 ret = net->ipv6.ip6_null_entry;
2462                 goto out;
2463         }
2464
2465         if (rt == net->ipv6.fib6_null_entry) {
2466                 fn = fib6_backtrack(fn, &fl6->saddr);
2467                 if (fn)
2468                         goto restart;
2469         }
2470
2471 out:
2472         if (ret)
2473                 ip6_hold_safe(net, &ret, true);
2474         else
2475                 ret = ip6_create_rt_rcu(rt);
2476
2477         rcu_read_unlock();
2478
2479         trace_fib6_table_lookup(net, rt, table, fl6);
2480         return ret;
2481 };
2482
2483 static struct dst_entry *ip6_route_redirect(struct net *net,
2484                                             const struct flowi6 *fl6,
2485                                             const struct sk_buff *skb,
2486                                             const struct in6_addr *gateway)
2487 {
2488         int flags = RT6_LOOKUP_F_HAS_SADDR;
2489         struct ip6rd_flowi rdfl;
2490
2491         rdfl.fl6 = *fl6;
2492         rdfl.gateway = *gateway;
2493
2494         return fib6_rule_lookup(net, &rdfl.fl6, skb,
2495                                 flags, __ip6_route_redirect);
2496 }
2497
2498 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2499                   kuid_t uid)
2500 {
2501         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2502         struct dst_entry *dst;
2503         struct flowi6 fl6 = {
2504                 .flowi6_iif = LOOPBACK_IFINDEX,
2505                 .flowi6_oif = oif,
2506                 .flowi6_mark = mark,
2507                 .daddr = iph->daddr,
2508                 .saddr = iph->saddr,
2509                 .flowlabel = ip6_flowinfo(iph),
2510                 .flowi6_uid = uid,
2511         };
2512
2513         dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2514         rt6_do_redirect(dst, NULL, skb);
2515         dst_release(dst);
2516 }
2517 EXPORT_SYMBOL_GPL(ip6_redirect);
2518
2519 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2520 {
2521         const struct ipv6hdr *iph = ipv6_hdr(skb);
2522         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2523         struct dst_entry *dst;
2524         struct flowi6 fl6 = {
2525                 .flowi6_iif = LOOPBACK_IFINDEX,
2526                 .flowi6_oif = oif,
2527                 .daddr = msg->dest,
2528                 .saddr = iph->daddr,
2529                 .flowi6_uid = sock_net_uid(net, NULL),
2530         };
2531
2532         dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2533         rt6_do_redirect(dst, NULL, skb);
2534         dst_release(dst);
2535 }
2536
2537 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2538 {
2539         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2540                      sk->sk_uid);
2541 }
2542 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2543
2544 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2545 {
2546         struct net_device *dev = dst->dev;
2547         unsigned int mtu = dst_mtu(dst);
2548         struct net *net = dev_net(dev);
2549
2550         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2551
2552         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2553                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2554
2555         /*
2556          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2557          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2558          * IPV6_MAXPLEN is also valid and means: "any MSS,
2559          * rely only on pmtu discovery"
2560          */
2561         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2562                 mtu = IPV6_MAXPLEN;
2563         return mtu;
2564 }
2565
2566 static unsigned int ip6_mtu(const struct dst_entry *dst)
2567 {
2568         struct inet6_dev *idev;
2569         unsigned int mtu;
2570
2571         mtu = dst_metric_raw(dst, RTAX_MTU);
2572         if (mtu)
2573                 goto out;
2574
2575         mtu = IPV6_MIN_MTU;
2576
2577         rcu_read_lock();
2578         idev = __in6_dev_get(dst->dev);
2579         if (idev)
2580                 mtu = idev->cnf.mtu6;
2581         rcu_read_unlock();
2582
2583 out:
2584         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2585
2586         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2587 }
2588
2589 /* MTU selection:
2590  * 1. mtu on route is locked - use it
2591  * 2. mtu from nexthop exception
2592  * 3. mtu from egress device
2593  *
2594  * based on ip6_dst_mtu_forward and exception logic of
2595  * rt6_find_cached_rt; called with rcu_read_lock
2596  */
2597 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2598                       struct in6_addr *saddr)
2599 {
2600         struct rt6_exception_bucket *bucket;
2601         struct rt6_exception *rt6_ex;
2602         struct in6_addr *src_key;
2603         struct inet6_dev *idev;
2604         u32 mtu = 0;
2605
2606         if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2607                 mtu = f6i->fib6_pmtu;
2608                 if (mtu)
2609                         goto out;
2610         }
2611
2612         src_key = NULL;
2613 #ifdef CONFIG_IPV6_SUBTREES
2614         if (f6i->fib6_src.plen)
2615                 src_key = saddr;
2616 #endif
2617
2618         bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2619         rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2620         if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2621                 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2622
2623         if (likely(!mtu)) {
2624                 struct net_device *dev = fib6_info_nh_dev(f6i);
2625
2626                 mtu = IPV6_MIN_MTU;
2627                 idev = __in6_dev_get(dev);
2628                 if (idev && idev->cnf.mtu6 > mtu)
2629                         mtu = idev->cnf.mtu6;
2630         }
2631
2632         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2633 out:
2634         return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2635 }
2636
2637 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2638                                   struct flowi6 *fl6)
2639 {
2640         struct dst_entry *dst;
2641         struct rt6_info *rt;
2642         struct inet6_dev *idev = in6_dev_get(dev);
2643         struct net *net = dev_net(dev);
2644
2645         if (unlikely(!idev))
2646                 return ERR_PTR(-ENODEV);
2647
2648         rt = ip6_dst_alloc(net, dev, 0);
2649         if (unlikely(!rt)) {
2650                 in6_dev_put(idev);
2651                 dst = ERR_PTR(-ENOMEM);
2652                 goto out;
2653         }
2654
2655         rt->dst.flags |= DST_HOST;
2656         rt->dst.input = ip6_input;
2657         rt->dst.output  = ip6_output;
2658         rt->rt6i_gateway  = fl6->daddr;
2659         rt->rt6i_dst.addr = fl6->daddr;
2660         rt->rt6i_dst.plen = 128;
2661         rt->rt6i_idev     = idev;
2662         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2663
2664         /* Add this dst into uncached_list so that rt6_disable_ip() can
2665          * do proper release of the net_device
2666          */
2667         rt6_uncached_list_add(rt);
2668         atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2669
2670         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2671
2672 out:
2673         return dst;
2674 }
2675
2676 static int ip6_dst_gc(struct dst_ops *ops)
2677 {
2678         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2679         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2680         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2681         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2682         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2683         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2684         int entries;
2685
2686         entries = dst_entries_get_fast(ops);
2687         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2688             entries <= rt_max_size)
2689                 goto out;
2690
2691         net->ipv6.ip6_rt_gc_expire++;
2692         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2693         entries = dst_entries_get_slow(ops);
2694         if (entries < ops->gc_thresh)
2695                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2696 out:
2697         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2698         return entries > rt_max_size;
2699 }
2700
2701 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2702                                             struct fib6_config *cfg,
2703                                             const struct in6_addr *gw_addr,
2704                                             u32 tbid, int flags)
2705 {
2706         struct flowi6 fl6 = {
2707                 .flowi6_oif = cfg->fc_ifindex,
2708                 .daddr = *gw_addr,
2709                 .saddr = cfg->fc_prefsrc,
2710         };
2711         struct fib6_table *table;
2712         struct rt6_info *rt;
2713
2714         table = fib6_get_table(net, tbid);
2715         if (!table)
2716                 return NULL;
2717
2718         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2719                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2720
2721         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2722         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2723
2724         /* if table lookup failed, fall back to full lookup */
2725         if (rt == net->ipv6.ip6_null_entry) {
2726                 ip6_rt_put(rt);
2727                 rt = NULL;
2728         }
2729
2730         return rt;
2731 }
2732
2733 static int ip6_route_check_nh_onlink(struct net *net,
2734                                      struct fib6_config *cfg,
2735                                      const struct net_device *dev,
2736                                      struct netlink_ext_ack *extack)
2737 {
2738         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2739         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2740         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2741         struct rt6_info *grt;
2742         int err;
2743
2744         err = 0;
2745         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2746         if (grt) {
2747                 if (!grt->dst.error &&
2748                     /* ignore match if it is the default route */
2749                     grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) &&
2750                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2751                         NL_SET_ERR_MSG(extack,
2752                                        "Nexthop has invalid gateway or device mismatch");
2753                         err = -EINVAL;
2754                 }
2755
2756                 ip6_rt_put(grt);
2757         }
2758
2759         return err;
2760 }
2761
2762 static int ip6_route_check_nh(struct net *net,
2763                               struct fib6_config *cfg,
2764                               struct net_device **_dev,
2765                               struct inet6_dev **idev)
2766 {
2767         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2768         struct net_device *dev = _dev ? *_dev : NULL;
2769         struct rt6_info *grt = NULL;
2770         int err = -EHOSTUNREACH;
2771
2772         if (cfg->fc_table) {
2773                 int flags = RT6_LOOKUP_F_IFACE;
2774
2775                 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2776                                           cfg->fc_table, flags);
2777                 if (grt) {
2778                         if (grt->rt6i_flags & RTF_GATEWAY ||
2779                             (dev && dev != grt->dst.dev)) {
2780                                 ip6_rt_put(grt);
2781                                 grt = NULL;
2782                         }
2783                 }
2784         }
2785
2786         if (!grt)
2787                 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2788
2789         if (!grt)
2790                 goto out;
2791
2792         if (dev) {
2793                 if (dev != grt->dst.dev) {
2794                         ip6_rt_put(grt);
2795                         goto out;
2796                 }
2797         } else {
2798                 *_dev = dev = grt->dst.dev;
2799                 *idev = grt->rt6i_idev;
2800                 dev_hold(dev);
2801                 in6_dev_hold(grt->rt6i_idev);
2802         }
2803
2804         if (!(grt->rt6i_flags & RTF_GATEWAY))
2805                 err = 0;
2806
2807         ip6_rt_put(grt);
2808
2809 out:
2810         return err;
2811 }
2812
2813 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2814                            struct net_device **_dev, struct inet6_dev **idev,
2815                            struct netlink_ext_ack *extack)
2816 {
2817         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2818         int gwa_type = ipv6_addr_type(gw_addr);
2819         bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2820         const struct net_device *dev = *_dev;
2821         bool need_addr_check = !dev;
2822         int err = -EINVAL;
2823
2824         /* if gw_addr is local we will fail to detect this in case
2825          * address is still TENTATIVE (DAD in progress). rt6_lookup()
2826          * will return already-added prefix route via interface that
2827          * prefix route was assigned to, which might be non-loopback.
2828          */
2829         if (dev &&
2830             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2831                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2832                 goto out;
2833         }
2834
2835         if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2836                 /* IPv6 strictly inhibits using not link-local
2837                  * addresses as nexthop address.
2838                  * Otherwise, router will not able to send redirects.
2839                  * It is very good, but in some (rare!) circumstances
2840                  * (SIT, PtP, NBMA NOARP links) it is handy to allow
2841                  * some exceptions. --ANK
2842                  * We allow IPv4-mapped nexthops to support RFC4798-type
2843                  * addressing
2844                  */
2845                 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2846                         NL_SET_ERR_MSG(extack, "Invalid gateway address");
2847                         goto out;
2848                 }
2849
2850                 if (cfg->fc_flags & RTNH_F_ONLINK)
2851                         err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2852                 else
2853                         err = ip6_route_check_nh(net, cfg, _dev, idev);
2854
2855                 if (err)
2856                         goto out;
2857         }
2858
2859         /* reload in case device was changed */
2860         dev = *_dev;
2861
2862         err = -EINVAL;
2863         if (!dev) {
2864                 NL_SET_ERR_MSG(extack, "Egress device not specified");
2865                 goto out;
2866         } else if (dev->flags & IFF_LOOPBACK) {
2867                 NL_SET_ERR_MSG(extack,
2868                                "Egress device can not be loopback device for this route");
2869                 goto out;
2870         }
2871
2872         /* if we did not check gw_addr above, do so now that the
2873          * egress device has been resolved.
2874          */
2875         if (need_addr_check &&
2876             ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2877                 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2878                 goto out;
2879         }
2880
2881         err = 0;
2882 out:
2883         return err;
2884 }
2885
2886 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2887                                               gfp_t gfp_flags,
2888                                               struct netlink_ext_ack *extack)
2889 {
2890         struct net *net = cfg->fc_nlinfo.nl_net;
2891         struct fib6_info *rt = NULL;
2892         struct net_device *dev = NULL;
2893         struct inet6_dev *idev = NULL;
2894         struct fib6_table *table;
2895         int addr_type;
2896         int err = -EINVAL;
2897
2898         /* RTF_PCPU is an internal flag; can not be set by userspace */
2899         if (cfg->fc_flags & RTF_PCPU) {
2900                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2901                 goto out;
2902         }
2903
2904         /* RTF_CACHE is an internal flag; can not be set by userspace */
2905         if (cfg->fc_flags & RTF_CACHE) {
2906                 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2907                 goto out;
2908         }
2909
2910         if (cfg->fc_type > RTN_MAX) {
2911                 NL_SET_ERR_MSG(extack, "Invalid route type");
2912                 goto out;
2913         }
2914
2915         if (cfg->fc_dst_len > 128) {
2916                 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2917                 goto out;
2918         }
2919         if (cfg->fc_src_len > 128) {
2920                 NL_SET_ERR_MSG(extack, "Invalid source address length");
2921                 goto out;
2922         }
2923 #ifndef CONFIG_IPV6_SUBTREES
2924         if (cfg->fc_src_len) {
2925                 NL_SET_ERR_MSG(extack,
2926                                "Specifying source address requires IPV6_SUBTREES to be enabled");
2927                 goto out;
2928         }
2929 #endif
2930         if (cfg->fc_ifindex) {
2931                 err = -ENODEV;
2932                 dev = dev_get_by_index(net, cfg->fc_ifindex);
2933                 if (!dev)
2934                         goto out;
2935                 idev = in6_dev_get(dev);
2936                 if (!idev)
2937                         goto out;
2938         }
2939
2940         if (cfg->fc_metric == 0)
2941                 cfg->fc_metric = IP6_RT_PRIO_USER;
2942
2943         if (cfg->fc_flags & RTNH_F_ONLINK) {
2944                 if (!dev) {
2945                         NL_SET_ERR_MSG(extack,
2946                                        "Nexthop device required for onlink");
2947                         err = -ENODEV;
2948                         goto out;
2949                 }
2950
2951                 if (!(dev->flags & IFF_UP)) {
2952                         NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2953                         err = -ENETDOWN;
2954                         goto out;
2955                 }
2956         }
2957
2958         err = -ENOBUFS;
2959         if (cfg->fc_nlinfo.nlh &&
2960             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2961                 table = fib6_get_table(net, cfg->fc_table);
2962                 if (!table) {
2963                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2964                         table = fib6_new_table(net, cfg->fc_table);
2965                 }
2966         } else {
2967                 table = fib6_new_table(net, cfg->fc_table);
2968         }
2969
2970         if (!table)
2971                 goto out;
2972
2973         err = -ENOMEM;
2974         rt = fib6_info_alloc(gfp_flags);
2975         if (!rt)
2976                 goto out;
2977
2978         rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2979                                                extack);
2980         if (IS_ERR(rt->fib6_metrics)) {
2981                 err = PTR_ERR(rt->fib6_metrics);
2982                 /* Do not leave garbage there. */
2983                 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2984                 goto out;
2985         }
2986
2987         if (cfg->fc_flags & RTF_ADDRCONF)
2988                 rt->dst_nocount = true;
2989
2990         if (cfg->fc_flags & RTF_EXPIRES)
2991                 fib6_set_expires(rt, jiffies +
2992                                 clock_t_to_jiffies(cfg->fc_expires));
2993         else
2994                 fib6_clean_expires(rt);
2995
2996         if (cfg->fc_protocol == RTPROT_UNSPEC)
2997                 cfg->fc_protocol = RTPROT_BOOT;
2998         rt->fib6_protocol = cfg->fc_protocol;
2999
3000         addr_type = ipv6_addr_type(&cfg->fc_dst);
3001
3002         if (cfg->fc_encap) {
3003                 struct lwtunnel_state *lwtstate;
3004
3005                 err = lwtunnel_build_state(cfg->fc_encap_type,
3006                                            cfg->fc_encap, AF_INET6, cfg,
3007                                            &lwtstate, extack);
3008                 if (err)
3009                         goto out;
3010                 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3011         }
3012
3013         ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3014         rt->fib6_dst.plen = cfg->fc_dst_len;
3015         if (rt->fib6_dst.plen == 128)
3016                 rt->dst_host = true;
3017
3018 #ifdef CONFIG_IPV6_SUBTREES
3019         ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3020         rt->fib6_src.plen = cfg->fc_src_len;
3021 #endif
3022
3023         rt->fib6_metric = cfg->fc_metric;
3024         rt->fib6_nh.nh_weight = 1;
3025
3026         rt->fib6_type = cfg->fc_type;
3027
3028         /* We cannot add true routes via loopback here,
3029            they would result in kernel looping; promote them to reject routes
3030          */
3031         if ((cfg->fc_flags & RTF_REJECT) ||
3032             (dev && (dev->flags & IFF_LOOPBACK) &&
3033              !(addr_type & IPV6_ADDR_LOOPBACK) &&
3034              !(cfg->fc_flags & RTF_LOCAL))) {
3035                 /* hold loopback dev/idev if we haven't done so. */
3036                 if (dev != net->loopback_dev) {
3037                         if (dev) {
3038                                 dev_put(dev);
3039                                 in6_dev_put(idev);
3040                         }
3041                         dev = net->loopback_dev;
3042                         dev_hold(dev);
3043                         idev = in6_dev_get(dev);
3044                         if (!idev) {
3045                                 err = -ENODEV;
3046                                 goto out;
3047                         }
3048                 }
3049                 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3050                 goto install_route;
3051         }
3052
3053         if (cfg->fc_flags & RTF_GATEWAY) {
3054                 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3055                 if (err)
3056                         goto out;
3057
3058                 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3059         }
3060
3061         err = -ENODEV;
3062         if (!dev)
3063                 goto out;
3064
3065         if (idev->cnf.disable_ipv6) {
3066                 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3067                 err = -EACCES;
3068                 goto out;
3069         }
3070
3071         if (!(dev->flags & IFF_UP)) {
3072                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3073                 err = -ENETDOWN;
3074                 goto out;
3075         }
3076
3077         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3078                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3079                         NL_SET_ERR_MSG(extack, "Invalid source address");
3080                         err = -EINVAL;
3081                         goto out;
3082                 }
3083                 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3084                 rt->fib6_prefsrc.plen = 128;
3085         } else
3086                 rt->fib6_prefsrc.plen = 0;
3087
3088         rt->fib6_flags = cfg->fc_flags;
3089
3090 install_route:
3091         if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3092             !netif_carrier_ok(dev))
3093                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3094         rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3095         rt->fib6_nh.nh_dev = dev;
3096         rt->fib6_table = table;
3097
3098         if (idev)
3099                 in6_dev_put(idev);
3100
3101         return rt;
3102 out:
3103         if (dev)
3104                 dev_put(dev);
3105         if (idev)
3106                 in6_dev_put(idev);
3107
3108         fib6_info_release(rt);
3109         return ERR_PTR(err);
3110 }
3111
3112 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3113                   struct netlink_ext_ack *extack)
3114 {
3115         struct fib6_info *rt;
3116         int err;
3117
3118         rt = ip6_route_info_create(cfg, gfp_flags, extack);
3119         if (IS_ERR(rt))
3120                 return PTR_ERR(rt);
3121
3122         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3123         fib6_info_release(rt);
3124
3125         return err;
3126 }
3127
3128 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3129 {
3130         struct net *net = info->nl_net;
3131         struct fib6_table *table;
3132         int err;
3133
3134         if (rt == net->ipv6.fib6_null_entry) {
3135                 err = -ENOENT;
3136                 goto out;
3137         }
3138
3139         table = rt->fib6_table;
3140         spin_lock_bh(&table->tb6_lock);
3141         err = fib6_del(rt, info);
3142         spin_unlock_bh(&table->tb6_lock);
3143
3144 out:
3145         fib6_info_release(rt);
3146         return err;
3147 }
3148
3149 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3150 {
3151         struct nl_info info = { .nl_net = net };
3152
3153         return __ip6_del_rt(rt, &info);
3154 }
3155
3156 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3157 {
3158         struct nl_info *info = &cfg->fc_nlinfo;
3159         struct net *net = info->nl_net;
3160         struct sk_buff *skb = NULL;
3161         struct fib6_table *table;
3162         int err = -ENOENT;
3163
3164         if (rt == net->ipv6.fib6_null_entry)
3165                 goto out_put;
3166         table = rt->fib6_table;
3167         spin_lock_bh(&table->tb6_lock);
3168
3169         if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3170                 struct fib6_info *sibling, *next_sibling;
3171
3172                 /* prefer to send a single notification with all hops */
3173                 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3174                 if (skb) {
3175                         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3176
3177                         if (rt6_fill_node(net, skb, rt, NULL,
3178                                           NULL, NULL, 0, RTM_DELROUTE,
3179                                           info->portid, seq, 0) < 0) {
3180                                 kfree_skb(skb);
3181                                 skb = NULL;
3182                         } else
3183                                 info->skip_notify = 1;
3184                 }
3185
3186                 list_for_each_entry_safe(sibling, next_sibling,
3187                                          &rt->fib6_siblings,
3188                                          fib6_siblings) {
3189                         err = fib6_del(sibling, info);
3190                         if (err)
3191                                 goto out_unlock;
3192                 }
3193         }
3194
3195         err = fib6_del(rt, info);
3196 out_unlock:
3197         spin_unlock_bh(&table->tb6_lock);
3198 out_put:
3199         fib6_info_release(rt);
3200
3201         if (skb) {
3202                 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3203                             info->nlh, gfp_any());
3204         }
3205         return err;
3206 }
3207
3208 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3209 {
3210         int rc = -ESRCH;
3211
3212         if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3213                 goto out;
3214
3215         if (cfg->fc_flags & RTF_GATEWAY &&
3216             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3217                 goto out;
3218         if (dst_hold_safe(&rt->dst))
3219                 rc = rt6_remove_exception_rt(rt);
3220 out:
3221         return rc;
3222 }
3223
3224 static int ip6_route_del(struct fib6_config *cfg,
3225                          struct netlink_ext_ack *extack)
3226 {
3227         struct rt6_info *rt_cache;
3228         struct fib6_table *table;
3229         struct fib6_info *rt;
3230         struct fib6_node *fn;
3231         int err = -ESRCH;
3232
3233         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3234         if (!table) {
3235                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3236                 return err;
3237         }
3238
3239         rcu_read_lock();
3240
3241         fn = fib6_locate(&table->tb6_root,
3242                          &cfg->fc_dst, cfg->fc_dst_len,
3243                          &cfg->fc_src, cfg->fc_src_len,
3244                          !(cfg->fc_flags & RTF_CACHE));
3245
3246         if (fn) {
3247                 for_each_fib6_node_rt_rcu(fn) {
3248                         if (cfg->fc_flags & RTF_CACHE) {
3249                                 int rc;
3250
3251                                 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3252                                                               &cfg->fc_src);
3253                                 if (rt_cache) {
3254                                         rc = ip6_del_cached_rt(rt_cache, cfg);
3255                                         if (rc != -ESRCH) {
3256                                                 rcu_read_unlock();
3257                                                 return rc;
3258                                         }
3259                                 }
3260                                 continue;
3261                         }
3262                         if (cfg->fc_ifindex &&
3263                             (!rt->fib6_nh.nh_dev ||
3264                              rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3265                                 continue;
3266                         if (cfg->fc_flags & RTF_GATEWAY &&
3267                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3268                                 continue;
3269                         if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3270                                 continue;
3271                         if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3272                                 continue;
3273                         if (!fib6_info_hold_safe(rt))
3274                                 continue;
3275                         rcu_read_unlock();
3276
3277                         /* if gateway was specified only delete the one hop */
3278                         if (cfg->fc_flags & RTF_GATEWAY)
3279                                 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3280
3281                         return __ip6_del_rt_siblings(rt, cfg);
3282                 }
3283         }
3284         rcu_read_unlock();
3285
3286         return err;
3287 }
3288
3289 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3290 {
3291         struct netevent_redirect netevent;
3292         struct rt6_info *rt, *nrt = NULL;
3293         struct ndisc_options ndopts;
3294         struct inet6_dev *in6_dev;
3295         struct neighbour *neigh;
3296         struct fib6_info *from;
3297         struct rd_msg *msg;
3298         int optlen, on_link;
3299         u8 *lladdr;
3300
3301         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3302         optlen -= sizeof(*msg);
3303
3304         if (optlen < 0) {
3305                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3306                 return;
3307         }
3308
3309         msg = (struct rd_msg *)icmp6_hdr(skb);
3310
3311         if (ipv6_addr_is_multicast(&msg->dest)) {
3312                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3313                 return;
3314         }
3315
3316         on_link = 0;
3317         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3318                 on_link = 1;
3319         } else if (ipv6_addr_type(&msg->target) !=
3320                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3321                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3322                 return;
3323         }
3324
3325         in6_dev = __in6_dev_get(skb->dev);
3326         if (!in6_dev)
3327                 return;
3328         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3329                 return;
3330
3331         /* RFC2461 8.1:
3332          *      The IP source address of the Redirect MUST be the same as the current
3333          *      first-hop router for the specified ICMP Destination Address.
3334          */
3335
3336         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3337                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3338                 return;
3339         }
3340
3341         lladdr = NULL;
3342         if (ndopts.nd_opts_tgt_lladdr) {
3343                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3344                                              skb->dev);
3345                 if (!lladdr) {
3346                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3347                         return;
3348                 }
3349         }
3350
3351         rt = (struct rt6_info *) dst;
3352         if (rt->rt6i_flags & RTF_REJECT) {
3353                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3354                 return;
3355         }
3356
3357         /* Redirect received -> path was valid.
3358          * Look, redirects are sent only in response to data packets,
3359          * so that this nexthop apparently is reachable. --ANK
3360          */
3361         dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3362
3363         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3364         if (!neigh)
3365                 return;
3366
3367         /*
3368          *      We have finally decided to accept it.
3369          */
3370
3371         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3372                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
3373                      NEIGH_UPDATE_F_OVERRIDE|
3374                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3375                                      NEIGH_UPDATE_F_ISROUTER)),
3376                      NDISC_REDIRECT, &ndopts);
3377
3378         rcu_read_lock();
3379         from = rcu_dereference(rt->from);
3380         /* This fib6_info_hold() is safe here because we hold reference to rt
3381          * and rt already holds reference to fib6_info.
3382          */
3383         fib6_info_hold(from);
3384         rcu_read_unlock();
3385
3386         nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3387         if (!nrt)
3388                 goto out;
3389
3390         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3391         if (on_link)
3392                 nrt->rt6i_flags &= ~RTF_GATEWAY;
3393
3394         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3395
3396         /* No need to remove rt from the exception table if rt is
3397          * a cached route because rt6_insert_exception() will
3398          * takes care of it
3399          */
3400         if (rt6_insert_exception(nrt, from)) {
3401                 dst_release_immediate(&nrt->dst);
3402                 goto out;
3403         }
3404
3405         netevent.old = &rt->dst;
3406         netevent.new = &nrt->dst;
3407         netevent.daddr = &msg->dest;
3408         netevent.neigh = neigh;
3409         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3410
3411 out:
3412         fib6_info_release(from);
3413         neigh_release(neigh);
3414 }
3415
3416 #ifdef CONFIG_IPV6_ROUTE_INFO
3417 static struct fib6_info *rt6_get_route_info(struct net *net,
3418                                            const struct in6_addr *prefix, int prefixlen,
3419                                            const struct in6_addr *gwaddr,
3420                                            struct net_device *dev)
3421 {
3422         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3423         int ifindex = dev->ifindex;
3424         struct fib6_node *fn;
3425         struct fib6_info *rt = NULL;
3426         struct fib6_table *table;
3427
3428         table = fib6_get_table(net, tb_id);
3429         if (!table)
3430                 return NULL;
3431
3432         rcu_read_lock();
3433         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3434         if (!fn)
3435                 goto out;
3436
3437         for_each_fib6_node_rt_rcu(fn) {
3438                 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3439                         continue;
3440                 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3441                         continue;
3442                 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3443                         continue;
3444                 if (!fib6_info_hold_safe(rt))
3445                         continue;
3446                 break;
3447         }
3448 out:
3449         rcu_read_unlock();
3450         return rt;
3451 }
3452
3453 static struct fib6_info *rt6_add_route_info(struct net *net,
3454                                            const struct in6_addr *prefix, int prefixlen,
3455                                            const struct in6_addr *gwaddr,
3456                                            struct net_device *dev,
3457                                            unsigned int pref)
3458 {
3459         struct fib6_config cfg = {
3460                 .fc_metric      = IP6_RT_PRIO_USER,
3461                 .fc_ifindex     = dev->ifindex,
3462                 .fc_dst_len     = prefixlen,
3463                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3464                                   RTF_UP | RTF_PREF(pref),
3465                 .fc_protocol = RTPROT_RA,
3466                 .fc_type = RTN_UNICAST,
3467                 .fc_nlinfo.portid = 0,
3468                 .fc_nlinfo.nlh = NULL,
3469                 .fc_nlinfo.nl_net = net,
3470         };
3471
3472         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3473         cfg.fc_dst = *prefix;
3474         cfg.fc_gateway = *gwaddr;
3475
3476         /* We should treat it as a default route if prefix length is 0. */
3477         if (!prefixlen)
3478                 cfg.fc_flags |= RTF_DEFAULT;
3479
3480         ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3481
3482         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3483 }
3484 #endif
3485
3486 struct fib6_info *rt6_get_dflt_router(struct net *net,
3487                                      const struct in6_addr *addr,
3488                                      struct net_device *dev)
3489 {
3490         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3491         struct fib6_info *rt;
3492         struct fib6_table *table;
3493
3494         table = fib6_get_table(net, tb_id);
3495         if (!table)
3496                 return NULL;
3497
3498         rcu_read_lock();
3499         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3500                 if (dev == rt->fib6_nh.nh_dev &&
3501                     ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3502                     ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3503                         break;
3504         }
3505         if (rt && !fib6_info_hold_safe(rt))
3506                 rt = NULL;
3507         rcu_read_unlock();
3508         return rt;
3509 }
3510
3511 struct fib6_info *rt6_add_dflt_router(struct net *net,
3512                                      const struct in6_addr *gwaddr,
3513                                      struct net_device *dev,
3514                                      unsigned int pref)
3515 {
3516         struct fib6_config cfg = {
3517                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3518                 .fc_metric      = IP6_RT_PRIO_USER,
3519                 .fc_ifindex     = dev->ifindex,
3520                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3521                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3522                 .fc_protocol = RTPROT_RA,
3523                 .fc_type = RTN_UNICAST,
3524                 .fc_nlinfo.portid = 0,
3525                 .fc_nlinfo.nlh = NULL,
3526                 .fc_nlinfo.nl_net = net,
3527         };
3528
3529         cfg.fc_gateway = *gwaddr;
3530
3531         if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3532                 struct fib6_table *table;
3533
3534                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3535                 if (table)
3536                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3537         }
3538
3539         return rt6_get_dflt_router(net, gwaddr, dev);
3540 }
3541
3542 static void __rt6_purge_dflt_routers(struct net *net,
3543                                      struct fib6_table *table)
3544 {
3545         struct fib6_info *rt;
3546
3547 restart:
3548         rcu_read_lock();
3549         for_each_fib6_node_rt_rcu(&table->tb6_root) {
3550                 struct net_device *dev = fib6_info_nh_dev(rt);
3551                 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3552
3553                 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3554                     (!idev || idev->cnf.accept_ra != 2) &&
3555                     fib6_info_hold_safe(rt)) {
3556                         rcu_read_unlock();
3557                         ip6_del_rt(net, rt);
3558                         goto restart;
3559                 }
3560         }
3561         rcu_read_unlock();
3562
3563         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3564 }
3565
3566 void rt6_purge_dflt_routers(struct net *net)
3567 {
3568         struct fib6_table *table;
3569         struct hlist_head *head;
3570         unsigned int h;
3571
3572         rcu_read_lock();
3573
3574         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3575                 head = &net->ipv6.fib_table_hash[h];
3576                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3577                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3578                                 __rt6_purge_dflt_routers(net, table);
3579                 }
3580         }
3581
3582         rcu_read_unlock();
3583 }
3584
3585 static void rtmsg_to_fib6_config(struct net *net,
3586                                  struct in6_rtmsg *rtmsg,
3587                                  struct fib6_config *cfg)
3588 {
3589         *cfg = (struct fib6_config){
3590                 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3591                          : RT6_TABLE_MAIN,
3592                 .fc_ifindex = rtmsg->rtmsg_ifindex,
3593                 .fc_metric = rtmsg->rtmsg_metric,
3594                 .fc_expires = rtmsg->rtmsg_info,
3595                 .fc_dst_len = rtmsg->rtmsg_dst_len,
3596                 .fc_src_len = rtmsg->rtmsg_src_len,
3597                 .fc_flags = rtmsg->rtmsg_flags,
3598                 .fc_type = rtmsg->rtmsg_type,
3599
3600                 .fc_nlinfo.nl_net = net,
3601
3602                 .fc_dst = rtmsg->rtmsg_dst,
3603                 .fc_src = rtmsg->rtmsg_src,
3604                 .fc_gateway = rtmsg->rtmsg_gateway,
3605         };
3606 }
3607
3608 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3609 {
3610         struct fib6_config cfg;
3611         struct in6_rtmsg rtmsg;
3612         int err;
3613
3614         switch (cmd) {
3615         case SIOCADDRT:         /* Add a route */
3616         case SIOCDELRT:         /* Delete a route */
3617                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3618                         return -EPERM;
3619                 err = copy_from_user(&rtmsg, arg,
3620                                      sizeof(struct in6_rtmsg));
3621                 if (err)
3622                         return -EFAULT;
3623
3624                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3625
3626                 rtnl_lock();
3627                 switch (cmd) {
3628                 case SIOCADDRT:
3629                         err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3630                         break;
3631                 case SIOCDELRT:
3632                         err = ip6_route_del(&cfg, NULL);
3633                         break;
3634                 default:
3635                         err = -EINVAL;
3636                 }
3637                 rtnl_unlock();
3638
3639                 return err;
3640         }
3641
3642         return -EINVAL;
3643 }
3644
3645 /*
3646  *      Drop the packet on the floor
3647  */
3648
3649 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3650 {
3651         int type;
3652         struct dst_entry *dst = skb_dst(skb);
3653         switch (ipstats_mib_noroutes) {
3654         case IPSTATS_MIB_INNOROUTES:
3655                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3656                 if (type == IPV6_ADDR_ANY) {
3657                         IP6_INC_STATS(dev_net(dst->dev),
3658                                       __in6_dev_get_safely(skb->dev),
3659                                       IPSTATS_MIB_INADDRERRORS);
3660                         break;
3661                 }
3662                 /* FALLTHROUGH */
3663         case IPSTATS_MIB_OUTNOROUTES:
3664                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3665                               ipstats_mib_noroutes);
3666                 break;
3667         }
3668         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3669         kfree_skb(skb);
3670         return 0;
3671 }
3672
3673 static int ip6_pkt_discard(struct sk_buff *skb)
3674 {
3675         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3676 }
3677
3678 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3679 {
3680         skb->dev = skb_dst(skb)->dev;
3681         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3682 }
3683
3684 static int ip6_pkt_prohibit(struct sk_buff *skb)
3685 {
3686         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3687 }
3688
3689 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3690 {
3691         skb->dev = skb_dst(skb)->dev;
3692         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3693 }
3694
3695 /*
3696  *      Allocate a dst for local (unicast / anycast) address.
3697  */
3698
3699 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3700                                      struct inet6_dev *idev,
3701                                      const struct in6_addr *addr,
3702                                      bool anycast, gfp_t gfp_flags)
3703 {
3704         u32 tb_id;
3705         struct net_device *dev = idev->dev;
3706         struct fib6_info *f6i;
3707
3708         f6i = fib6_info_alloc(gfp_flags);
3709         if (!f6i)
3710                 return ERR_PTR(-ENOMEM);
3711
3712         f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3713         f6i->dst_nocount = true;
3714         f6i->dst_host = true;
3715         f6i->fib6_protocol = RTPROT_KERNEL;
3716         f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3717         if (anycast) {
3718                 f6i->fib6_type = RTN_ANYCAST;
3719                 f6i->fib6_flags |= RTF_ANYCAST;
3720         } else {
3721                 f6i->fib6_type = RTN_LOCAL;
3722                 f6i->fib6_flags |= RTF_LOCAL;
3723         }
3724
3725         f6i->fib6_nh.nh_gw = *addr;
3726         dev_hold(dev);
3727         f6i->fib6_nh.nh_dev = dev;
3728         f6i->fib6_dst.addr = *addr;
3729         f6i->fib6_dst.plen = 128;
3730         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3731         f6i->fib6_table = fib6_get_table(net, tb_id);
3732
3733         return f6i;
3734 }
3735
/*
 * Argument bundle for fib6_remove_prefsrc(): removes a deleted address
 * from the prefsrc entries of matching routes.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose null entry is skipped */
	struct in6_addr *addr;	/* the address being removed */
};
3742
3743 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3744 {
3745         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3746         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3747         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3748
3749         if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3750             rt != net->ipv6.fib6_null_entry &&
3751             ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3752                 spin_lock_bh(&rt6_exception_lock);
3753                 /* remove prefsrc entry */
3754                 rt->fib6_prefsrc.plen = 0;
3755                 spin_unlock_bh(&rt6_exception_lock);
3756         }
3757         return 0;
3758 }
3759
3760 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3761 {
3762         struct net *net = dev_net(ifp->idev->dev);
3763         struct arg_dev_net_ip adni = {
3764                 .dev = ifp->idev->dev,
3765                 .net = net,
3766                 .addr = &ifp->addr,
3767         };
3768         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3769 }
3770
3771 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3772
3773 /* Remove routers and update dst entries when gateway turn into host. */
3774 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3775 {
3776         struct in6_addr *gateway = (struct in6_addr *)arg;
3777
3778         if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3779             ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3780                 return -1;
3781         }
3782
3783         /* Further clean up cached routes in exception table.
3784          * This is needed because cached route may have a different
3785          * gateway than its 'parent' in the case of an ip redirect.
3786          */
3787         rt6_exceptions_clean_tohost(rt, gateway);
3788
3789         return 0;
3790 }
3791
3792 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3793 {
3794         fib6_clean_all(net, fib6_clean_tohost, gateway);
3795 }
3796
/*
 * Argument bundle for the device up/down FIB walkers.  The union member in
 * use depends on the callback: fib6_ifup() reads nh_flags, fib6_ifdown()
 * reads event.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* nexthop flags to clear (fib6_ifup) */
		unsigned long event;	/* NETDEV_* event (fib6_ifdown) */
	};
};
3804
3805 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3806 {
3807         struct fib6_info *iter;
3808         struct fib6_node *fn;
3809
3810         fn = rcu_dereference_protected(rt->fib6_node,
3811                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3812         iter = rcu_dereference_protected(fn->leaf,
3813                         lockdep_is_held(&rt->fib6_table->tb6_lock));
3814         while (iter) {
3815                 if (iter->fib6_metric == rt->fib6_metric &&
3816                     rt6_qualify_for_ecmp(iter))
3817                         return iter;
3818                 iter = rcu_dereference_protected(iter->fib6_next,
3819                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
3820         }
3821
3822         return NULL;
3823 }
3824
3825 static bool rt6_is_dead(const struct fib6_info *rt)
3826 {
3827         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3828             (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3829              fib6_ignore_linkdown(rt)))
3830                 return true;
3831
3832         return false;
3833 }
3834
3835 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3836 {
3837         struct fib6_info *iter;
3838         int total = 0;
3839
3840         if (!rt6_is_dead(rt))
3841                 total += rt->fib6_nh.nh_weight;
3842
3843         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3844                 if (!rt6_is_dead(iter))
3845                         total += iter->fib6_nh.nh_weight;
3846         }
3847
3848         return total;
3849 }
3850
3851 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3852 {
3853         int upper_bound = -1;
3854
3855         if (!rt6_is_dead(rt)) {
3856                 *weight += rt->fib6_nh.nh_weight;
3857                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3858                                                     total) - 1;
3859         }
3860         atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3861 }
3862
3863 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3864 {
3865         struct fib6_info *iter;
3866         int weight = 0;
3867
3868         rt6_upper_bound_set(rt, &weight, total);
3869
3870         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3871                 rt6_upper_bound_set(iter, &weight, total);
3872 }
3873
3874 void rt6_multipath_rebalance(struct fib6_info *rt)
3875 {
3876         struct fib6_info *first;
3877         int total;
3878
3879         /* In case the entire multipath route was marked for flushing,
3880          * then there is no need to rebalance upon the removal of every
3881          * sibling route.
3882          */
3883         if (!rt->fib6_nsiblings || rt->should_flush)
3884                 return;
3885
3886         /* During lookup routes are evaluated in order, so we need to
3887          * make sure upper bounds are assigned from the first sibling
3888          * onwards.
3889          */
3890         first = rt6_multipath_first_sibling(rt);
3891         if (WARN_ON_ONCE(!first))
3892                 return;
3893
3894         total = rt6_multipath_total_weight(first);
3895         rt6_multipath_upper_bound_set(first, total);
3896 }
3897
3898 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3899 {
3900         const struct arg_netdev_event *arg = p_arg;
3901         struct net *net = dev_net(arg->dev);
3902
3903         if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3904                 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3905                 fib6_update_sernum_upto_root(net, rt);
3906                 rt6_multipath_rebalance(rt);
3907         }
3908
3909         return 0;
3910 }
3911
3912 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3913 {
3914         struct arg_netdev_event arg = {
3915                 .dev = dev,
3916                 {
3917                         .nh_flags = nh_flags,
3918                 },
3919         };
3920
3921         if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3922                 arg.nh_flags |= RTNH_F_LINKDOWN;
3923
3924         fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3925 }
3926
3927 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3928                                    const struct net_device *dev)
3929 {
3930         struct fib6_info *iter;
3931
3932         if (rt->fib6_nh.nh_dev == dev)
3933                 return true;
3934         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3935                 if (iter->fib6_nh.nh_dev == dev)
3936                         return true;
3937
3938         return false;
3939 }
3940
3941 static void rt6_multipath_flush(struct fib6_info *rt)
3942 {
3943         struct fib6_info *iter;
3944
3945         rt->should_flush = 1;
3946         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3947                 iter->should_flush = 1;
3948 }
3949
3950 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3951                                              const struct net_device *down_dev)
3952 {
3953         struct fib6_info *iter;
3954         unsigned int dead = 0;
3955
3956         if (rt->fib6_nh.nh_dev == down_dev ||
3957             rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3958                 dead++;
3959         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3960                 if (iter->fib6_nh.nh_dev == down_dev ||
3961                     iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3962                         dead++;
3963
3964         return dead;
3965 }
3966
3967 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3968                                        const struct net_device *dev,
3969                                        unsigned int nh_flags)
3970 {
3971         struct fib6_info *iter;
3972
3973         if (rt->fib6_nh.nh_dev == dev)
3974                 rt->fib6_nh.nh_flags |= nh_flags;
3975         list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976                 if (iter->fib6_nh.nh_dev == dev)
3977                         iter->fib6_nh.nh_flags |= nh_flags;
3978 }
3979
3980 /* called with write lock held for table with rt */
3981 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3982 {
3983         const struct arg_netdev_event *arg = p_arg;
3984         const struct net_device *dev = arg->dev;
3985         struct net *net = dev_net(dev);
3986
3987         if (rt == net->ipv6.fib6_null_entry)
3988                 return 0;
3989
3990         switch (arg->event) {
3991         case NETDEV_UNREGISTER:
3992                 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3993         case NETDEV_DOWN:
3994                 if (rt->should_flush)
3995                         return -1;
3996                 if (!rt->fib6_nsiblings)
3997                         return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3998                 if (rt6_multipath_uses_dev(rt, dev)) {
3999                         unsigned int count;
4000
4001                         count = rt6_multipath_dead_count(rt, dev);
4002                         if (rt->fib6_nsiblings + 1 == count) {
4003                                 rt6_multipath_flush(rt);
4004                                 return -1;
4005                         }
4006                         rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4007                                                    RTNH_F_LINKDOWN);
4008                         fib6_update_sernum(net, rt);
4009                         rt6_multipath_rebalance(rt);
4010                 }
4011                 return -2;
4012         case NETDEV_CHANGE:
4013                 if (rt->fib6_nh.nh_dev != dev ||
4014                     rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4015                         break;
4016                 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4017                 rt6_multipath_rebalance(rt);
4018                 break;
4019         }
4020
4021         return 0;
4022 }
4023
4024 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4025 {
4026         struct arg_netdev_event arg = {
4027                 .dev = dev,
4028                 {
4029                         .event = event,
4030                 },
4031         };
4032         struct net *net = dev_net(dev);
4033
4034         if (net->ipv6.sysctl.skip_notify_on_dev_down)
4035                 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4036         else
4037                 fib6_clean_all(net, fib6_ifdown, &arg);
4038 }
4039
4040 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4041 {
4042         rt6_sync_down_dev(dev, event);
4043         rt6_uncached_list_flush_dev(dev_net(dev), dev);
4044         neigh_ifdown(&nd_tbl, dev);
4045 }
4046
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4051
4052 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4053 {
4054         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4055         struct inet6_dev *idev;
4056
4057         /* In IPv6 pmtu discovery is not optional,
4058            so that RTAX_MTU lock cannot disable it.
4059            We still use this lock to block changes
4060            caused by addrconf/ndisc.
4061         */
4062
4063         idev = __in6_dev_get(arg->dev);
4064         if (!idev)
4065                 return 0;
4066
4067         /* For administrative MTU increase, there is no way to discover
4068            IPv6 PMTU increase, so PMTU increase should be updated here.
4069            Since RFC 1981 doesn't include administrative MTU increase
4070            update PMTU increase is a MUST. (i.e. jumbo frame)
4071          */
4072         if (rt->fib6_nh.nh_dev == arg->dev &&
4073             !fib6_metric_locked(rt, RTAX_MTU)) {
4074                 u32 mtu = rt->fib6_pmtu;
4075
4076                 if (mtu >= arg->mtu ||
4077                     (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4078                         fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4079
4080                 spin_lock_bh(&rt6_exception_lock);
4081                 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4082                 spin_unlock_bh(&rt6_exception_lock);
4083         }
4084         return 0;
4085 }
4086
4087 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4088 {
4089         struct rt6_mtu_change_arg arg = {
4090                 .dev = dev,
4091                 .mtu = mtu,
4092         };
4093
4094         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4095 }
4096
4097 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4098         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4099         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
4100         [RTA_OIF]               = { .type = NLA_U32 },
4101         [RTA_IIF]               = { .type = NLA_U32 },
4102         [RTA_PRIORITY]          = { .type = NLA_U32 },
4103         [RTA_METRICS]           = { .type = NLA_NESTED },
4104         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
4105         [RTA_PREF]              = { .type = NLA_U8 },
4106         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
4107         [RTA_ENCAP]             = { .type = NLA_NESTED },
4108         [RTA_EXPIRES]           = { .type = NLA_U32 },
4109         [RTA_UID]               = { .type = NLA_U32 },
4110         [RTA_MARK]              = { .type = NLA_U32 },
4111         [RTA_TABLE]             = { .type = NLA_U32 },
4112         [RTA_IP_PROTO]          = { .type = NLA_U8 },
4113         [RTA_SPORT]             = { .type = NLA_U16 },
4114         [RTA_DPORT]             = { .type = NLA_U16 },
4115 };
4116
4117 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4118                               struct fib6_config *cfg,
4119                               struct netlink_ext_ack *extack)
4120 {
4121         struct rtmsg *rtm;
4122         struct nlattr *tb[RTA_MAX+1];
4123         unsigned int pref;
4124         int err;
4125
4126         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4127                           extack);
4128         if (err < 0)
4129                 goto errout;
4130
4131         err = -EINVAL;
4132         rtm = nlmsg_data(nlh);
4133
4134         *cfg = (struct fib6_config){
4135                 .fc_table = rtm->rtm_table,
4136                 .fc_dst_len = rtm->rtm_dst_len,
4137                 .fc_src_len = rtm->rtm_src_len,
4138                 .fc_flags = RTF_UP,
4139                 .fc_protocol = rtm->rtm_protocol,
4140                 .fc_type = rtm->rtm_type,
4141
4142                 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4143                 .fc_nlinfo.nlh = nlh,
4144                 .fc_nlinfo.nl_net = sock_net(skb->sk),
4145         };
4146
4147         if (rtm->rtm_type == RTN_UNREACHABLE ||
4148             rtm->rtm_type == RTN_BLACKHOLE ||
4149             rtm->rtm_type == RTN_PROHIBIT ||
4150             rtm->rtm_type == RTN_THROW)
4151                 cfg->fc_flags |= RTF_REJECT;
4152
4153         if (rtm->rtm_type == RTN_LOCAL)
4154                 cfg->fc_flags |= RTF_LOCAL;
4155
4156         if (rtm->rtm_flags & RTM_F_CLONED)
4157                 cfg->fc_flags |= RTF_CACHE;
4158
4159         cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4160
4161         if (tb[RTA_GATEWAY]) {
4162                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4163                 cfg->fc_flags |= RTF_GATEWAY;
4164         }
4165
4166         if (tb[RTA_DST]) {
4167                 int plen = (rtm->rtm_dst_len + 7) >> 3;
4168
4169                 if (nla_len(tb[RTA_DST]) < plen)
4170                         goto errout;
4171
4172                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4173         }
4174
4175         if (tb[RTA_SRC]) {
4176                 int plen = (rtm->rtm_src_len + 7) >> 3;
4177
4178                 if (nla_len(tb[RTA_SRC]) < plen)
4179                         goto errout;
4180
4181                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4182         }
4183
4184         if (tb[RTA_PREFSRC])
4185                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4186
4187         if (tb[RTA_OIF])
4188                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4189
4190         if (tb[RTA_PRIORITY])
4191                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4192
4193         if (tb[RTA_METRICS]) {
4194                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4195                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4196         }
4197
4198         if (tb[RTA_TABLE])
4199                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4200
4201         if (tb[RTA_MULTIPATH]) {
4202                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4203                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4204
4205                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4206                                                      cfg->fc_mp_len, extack);
4207                 if (err < 0)
4208                         goto errout;
4209         }
4210
4211         if (tb[RTA_PREF]) {
4212                 pref = nla_get_u8(tb[RTA_PREF]);
4213                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4214                     pref != ICMPV6_ROUTER_PREF_HIGH)
4215                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
4216                 cfg->fc_flags |= RTF_PREF(pref);
4217         }
4218
4219         if (tb[RTA_ENCAP])
4220                 cfg->fc_encap = tb[RTA_ENCAP];
4221
4222         if (tb[RTA_ENCAP_TYPE]) {
4223                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4224
4225                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4226                 if (err < 0)
4227                         goto errout;
4228         }
4229
4230         if (tb[RTA_EXPIRES]) {
4231                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4232
4233                 if (addrconf_finite_timeout(timeout)) {
4234                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4235                         cfg->fc_flags |= RTF_EXPIRES;
4236                 }
4237         }
4238
4239         err = 0;
4240 errout:
4241         return err;
4242 }
4243
/* One nexthop of a multipath request while it is being processed by
 * ip6_route_multipath_add().
 */
struct rt6_nh {
        /* route built for this nexthop; reset to NULL once an insert
         * has been attempted
         */
        struct fib6_info *fib6_info;
        /* copy of the per-nexthop config the route was created from;
         * passed to ip6_route_del() when a partial add is rolled back
         */
        struct fib6_config r_cfg;
        /* linkage in the caller's rt6_nh_list */
        struct list_head next;
};
4249
4250 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4251 {
4252         struct rt6_nh *nh;
4253
4254         list_for_each_entry(nh, rt6_nh_list, next) {
4255                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4256                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4257                         nh->r_cfg.fc_ifindex);
4258         }
4259 }
4260
4261 static int ip6_route_info_append(struct net *net,
4262                                  struct list_head *rt6_nh_list,
4263                                  struct fib6_info *rt,
4264                                  struct fib6_config *r_cfg)
4265 {
4266         struct rt6_nh *nh;
4267         int err = -EEXIST;
4268
4269         list_for_each_entry(nh, rt6_nh_list, next) {
4270                 /* check if fib6_info already exists */
4271                 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4272                         return err;
4273         }
4274
4275         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4276         if (!nh)
4277                 return -ENOMEM;
4278         nh->fib6_info = rt;
4279         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4280         list_add_tail(&nh->next, rt6_nh_list);
4281
4282         return 0;
4283 }
4284
4285 static void ip6_route_mpath_notify(struct fib6_info *rt,
4286                                    struct fib6_info *rt_last,
4287                                    struct nl_info *info,
4288                                    __u16 nlflags)
4289 {
4290         /* if this is an APPEND route, then rt points to the first route
4291          * inserted and rt_last points to last route inserted. Userspace
4292          * wants a consistent dump of the route which starts at the first
4293          * nexthop. Since sibling routes are always added at the end of
4294          * the list, find the first sibling of the last route appended
4295          */
4296         if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4297                 rt = list_first_entry(&rt_last->fib6_siblings,
4298                                       struct fib6_info,
4299                                       fib6_siblings);
4300         }
4301
4302         if (rt)
4303                 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4304 }
4305
4306 static int ip6_route_multipath_add(struct fib6_config *cfg,
4307                                    struct netlink_ext_ack *extack)
4308 {
4309         struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4310         struct nl_info *info = &cfg->fc_nlinfo;
4311         struct fib6_config r_cfg;
4312         struct rtnexthop *rtnh;
4313         struct fib6_info *rt;
4314         struct rt6_nh *err_nh;
4315         struct rt6_nh *nh, *nh_safe;
4316         __u16 nlflags;
4317         int remaining;
4318         int attrlen;
4319         int err = 1;
4320         int nhn = 0;
4321         int replace = (cfg->fc_nlinfo.nlh &&
4322                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4323         LIST_HEAD(rt6_nh_list);
4324
4325         nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4326         if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4327                 nlflags |= NLM_F_APPEND;
4328
4329         remaining = cfg->fc_mp_len;
4330         rtnh = (struct rtnexthop *)cfg->fc_mp;
4331
4332         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4333          * fib6_info structs per nexthop
4334          */
4335         while (rtnh_ok(rtnh, remaining)) {
4336                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4337                 if (rtnh->rtnh_ifindex)
4338                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4339
4340                 attrlen = rtnh_attrlen(rtnh);
4341                 if (attrlen > 0) {
4342                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4343
4344                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4345                         if (nla) {
4346                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4347                                 r_cfg.fc_flags |= RTF_GATEWAY;
4348                         }
4349                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4350                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4351                         if (nla)
4352                                 r_cfg.fc_encap_type = nla_get_u16(nla);
4353                 }
4354
4355                 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4356                 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4357                 if (IS_ERR(rt)) {
4358                         err = PTR_ERR(rt);
4359                         rt = NULL;
4360                         goto cleanup;
4361                 }
4362                 if (!rt6_qualify_for_ecmp(rt)) {
4363                         err = -EINVAL;
4364                         NL_SET_ERR_MSG(extack,
4365                                        "Device only routes can not be added for IPv6 using the multipath API.");
4366                         fib6_info_release(rt);
4367                         goto cleanup;
4368                 }
4369
4370                 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4371
4372                 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4373                                             rt, &r_cfg);
4374                 if (err) {
4375                         fib6_info_release(rt);
4376                         goto cleanup;
4377                 }
4378
4379                 rtnh = rtnh_next(rtnh, &remaining);
4380         }
4381
4382         /* for add and replace send one notification with all nexthops.
4383          * Skip the notification in fib6_add_rt2node and send one with
4384          * the full route when done
4385          */
4386         info->skip_notify = 1;
4387
4388         err_nh = NULL;
4389         list_for_each_entry(nh, &rt6_nh_list, next) {
4390                 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4391                 fib6_info_release(nh->fib6_info);
4392
4393                 if (!err) {
4394                         /* save reference to last route successfully inserted */
4395                         rt_last = nh->fib6_info;
4396
4397                         /* save reference to first route for notification */
4398                         if (!rt_notif)
4399                                 rt_notif = nh->fib6_info;
4400                 }
4401
4402                 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4403                 nh->fib6_info = NULL;
4404                 if (err) {
4405                         if (replace && nhn)
4406                                 ip6_print_replace_route_err(&rt6_nh_list);
4407                         err_nh = nh;
4408                         goto add_errout;
4409                 }
4410
4411                 /* Because each route is added like a single route we remove
4412                  * these flags after the first nexthop: if there is a collision,
4413                  * we have already failed to add the first nexthop:
4414                  * fib6_add_rt2node() has rejected it; when replacing, old
4415                  * nexthops have been replaced by first new, the rest should
4416                  * be added to it.
4417                  */
4418                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4419                                                      NLM_F_REPLACE);
4420                 nhn++;
4421         }
4422
4423         /* success ... tell user about new route */
4424         ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4425         goto cleanup;
4426
4427 add_errout:
4428         /* send notification for routes that were added so that
4429          * the delete notifications sent by ip6_route_del are
4430          * coherent
4431          */
4432         if (rt_notif)
4433                 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4434
4435         /* Delete routes that were already added */
4436         list_for_each_entry(nh, &rt6_nh_list, next) {
4437                 if (err_nh == nh)
4438                         break;
4439                 ip6_route_del(&nh->r_cfg, extack);
4440         }
4441
4442 cleanup:
4443         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4444                 if (nh->fib6_info)
4445                         fib6_info_release(nh->fib6_info);
4446                 list_del(&nh->next);
4447                 kfree(nh);
4448         }
4449
4450         return err;
4451 }
4452
4453 static int ip6_route_multipath_del(struct fib6_config *cfg,
4454                                    struct netlink_ext_ack *extack)
4455 {
4456         struct fib6_config r_cfg;
4457         struct rtnexthop *rtnh;
4458         int remaining;
4459         int attrlen;
4460         int err = 1, last_err = 0;
4461
4462         remaining = cfg->fc_mp_len;
4463         rtnh = (struct rtnexthop *)cfg->fc_mp;
4464
4465         /* Parse a Multipath Entry */
4466         while (rtnh_ok(rtnh, remaining)) {
4467                 memcpy(&r_cfg, cfg, sizeof(*cfg));
4468                 if (rtnh->rtnh_ifindex)
4469                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4470
4471                 attrlen = rtnh_attrlen(rtnh);
4472                 if (attrlen > 0) {
4473                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4474
4475                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4476                         if (nla) {
4477                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4478                                 r_cfg.fc_flags |= RTF_GATEWAY;
4479                         }
4480                 }
4481                 err = ip6_route_del(&r_cfg, extack);
4482                 if (err)
4483                         last_err = err;
4484
4485                 rtnh = rtnh_next(rtnh, &remaining);
4486         }
4487
4488         return last_err;
4489 }
4490
4491 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4492                               struct netlink_ext_ack *extack)
4493 {
4494         struct fib6_config cfg;
4495         int err;
4496
4497         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4498         if (err < 0)
4499                 return err;
4500
4501         if (cfg.fc_mp)
4502                 return ip6_route_multipath_del(&cfg, extack);
4503         else {
4504                 cfg.fc_delete_all_nh = 1;
4505                 return ip6_route_del(&cfg, extack);
4506         }
4507 }
4508
4509 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4510                               struct netlink_ext_ack *extack)
4511 {
4512         struct fib6_config cfg;
4513         int err;
4514
4515         err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4516         if (err < 0)
4517                 return err;
4518
4519         if (cfg.fc_mp)
4520                 return ip6_route_multipath_add(&cfg, extack);
4521         else
4522                 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4523 }
4524
4525 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4526 {
4527         int nexthop_len = 0;
4528
4529         if (rt->fib6_nsiblings) {
4530                 nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
4531                             + NLA_ALIGN(sizeof(struct rtnexthop))
4532                             + nla_total_size(16) /* RTA_GATEWAY */
4533                             + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4534
4535                 nexthop_len *= rt->fib6_nsiblings;
4536         }
4537
4538         return NLMSG_ALIGN(sizeof(struct rtmsg))
4539                + nla_total_size(16) /* RTA_SRC */
4540                + nla_total_size(16) /* RTA_DST */
4541                + nla_total_size(16) /* RTA_GATEWAY */
4542                + nla_total_size(16) /* RTA_PREFSRC */
4543                + nla_total_size(4) /* RTA_TABLE */
4544                + nla_total_size(4) /* RTA_IIF */
4545                + nla_total_size(4) /* RTA_OIF */
4546                + nla_total_size(4) /* RTA_PRIORITY */
4547                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4548                + nla_total_size(sizeof(struct rta_cacheinfo))
4549                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4550                + nla_total_size(1) /* RTA_PREF */
4551                + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4552                + nexthop_len;
4553 }
4554
4555 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4556                             unsigned int *flags, bool skip_oif)
4557 {
4558         if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4559                 *flags |= RTNH_F_DEAD;
4560
4561         if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4562                 *flags |= RTNH_F_LINKDOWN;
4563
4564                 rcu_read_lock();
4565                 if (fib6_ignore_linkdown(rt))
4566                         *flags |= RTNH_F_DEAD;
4567                 rcu_read_unlock();
4568         }
4569
4570         if (rt->fib6_flags & RTF_GATEWAY) {
4571                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4572                         goto nla_put_failure;
4573         }
4574
4575         *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4576         if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4577                 *flags |= RTNH_F_OFFLOAD;
4578
4579         /* not needed for multipath encoding b/c it has a rtnexthop struct */
4580         if (!skip_oif && rt->fib6_nh.nh_dev &&
4581             nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4582                 goto nla_put_failure;
4583
4584         if (rt->fib6_nh.nh_lwtstate &&
4585             lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4586                 goto nla_put_failure;
4587
4588         return 0;
4589
4590 nla_put_failure:
4591         return -EMSGSIZE;
4592 }
4593
4594 /* add multipath next hop */
4595 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4596 {
4597         const struct net_device *dev = rt->fib6_nh.nh_dev;
4598         struct rtnexthop *rtnh;
4599         unsigned int flags = 0;
4600
4601         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4602         if (!rtnh)
4603                 goto nla_put_failure;
4604
4605         rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4606         rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4607
4608         if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4609                 goto nla_put_failure;
4610
4611         rtnh->rtnh_flags = flags;
4612
4613         /* length of rtnetlink header + attributes */
4614         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4615
4616         return 0;
4617
4618 nla_put_failure:
4619         return -EMSGSIZE;
4620 }
4621
4622 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4623                          struct fib6_info *rt, struct dst_entry *dst,
4624                          struct in6_addr *dest, struct in6_addr *src,
4625                          int iif, int type, u32 portid, u32 seq,
4626                          unsigned int flags)
4627 {
4628         struct rt6_info *rt6 = (struct rt6_info *)dst;
4629         struct rt6key *rt6_dst, *rt6_src;
4630         u32 *pmetrics, table, rt6_flags;
4631         struct nlmsghdr *nlh;
4632         struct rtmsg *rtm;
4633         long expires = 0;
4634
4635         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4636         if (!nlh)
4637                 return -EMSGSIZE;
4638
4639         if (rt6) {
4640                 rt6_dst = &rt6->rt6i_dst;
4641                 rt6_src = &rt6->rt6i_src;
4642                 rt6_flags = rt6->rt6i_flags;
4643         } else {
4644                 rt6_dst = &rt->fib6_dst;
4645                 rt6_src = &rt->fib6_src;
4646                 rt6_flags = rt->fib6_flags;
4647         }
4648
4649         rtm = nlmsg_data(nlh);
4650         rtm->rtm_family = AF_INET6;
4651         rtm->rtm_dst_len = rt6_dst->plen;
4652         rtm->rtm_src_len = rt6_src->plen;
4653         rtm->rtm_tos = 0;
4654         if (rt->fib6_table)
4655                 table = rt->fib6_table->tb6_id;
4656         else
4657                 table = RT6_TABLE_UNSPEC;
4658         rtm->rtm_table = table;
4659         if (nla_put_u32(skb, RTA_TABLE, table))
4660                 goto nla_put_failure;
4661
4662         rtm->rtm_type = rt->fib6_type;
4663         rtm->rtm_flags = 0;
4664         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4665         rtm->rtm_protocol = rt->fib6_protocol;
4666
4667         if (rt6_flags & RTF_CACHE)
4668                 rtm->rtm_flags |= RTM_F_CLONED;
4669
4670         if (dest) {
4671                 if (nla_put_in6_addr(skb, RTA_DST, dest))
4672                         goto nla_put_failure;
4673                 rtm->rtm_dst_len = 128;
4674         } else if (rtm->rtm_dst_len)
4675                 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4676                         goto nla_put_failure;
4677 #ifdef CONFIG_IPV6_SUBTREES
4678         if (src) {
4679                 if (nla_put_in6_addr(skb, RTA_SRC, src))
4680                         goto nla_put_failure;
4681                 rtm->rtm_src_len = 128;
4682         } else if (rtm->rtm_src_len &&
4683                    nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4684                 goto nla_put_failure;
4685 #endif
4686         if (iif) {
4687 #ifdef CONFIG_IPV6_MROUTE
4688                 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4689                         int err = ip6mr_get_route(net, skb, rtm, portid);
4690
4691                         if (err == 0)
4692                                 return 0;
4693                         if (err < 0)
4694                                 goto nla_put_failure;
4695                 } else
4696 #endif
4697                         if (nla_put_u32(skb, RTA_IIF, iif))
4698                                 goto nla_put_failure;
4699         } else if (dest) {
4700                 struct in6_addr saddr_buf;
4701                 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4702                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4703                         goto nla_put_failure;
4704         }
4705
4706         if (rt->fib6_prefsrc.plen) {
4707                 struct in6_addr saddr_buf;
4708                 saddr_buf = rt->fib6_prefsrc.addr;
4709                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4710                         goto nla_put_failure;
4711         }
4712
4713         pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4714         if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4715                 goto nla_put_failure;
4716
4717         if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4718                 goto nla_put_failure;
4719
4720         /* For multipath routes, walk the siblings list and add
4721          * each as a nexthop within RTA_MULTIPATH.
4722          */
4723         if (rt6) {
4724                 if (rt6_flags & RTF_GATEWAY &&
4725                     nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4726                         goto nla_put_failure;
4727
4728                 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4729                         goto nla_put_failure;
4730         } else if (rt->fib6_nsiblings) {
4731                 struct fib6_info *sibling, *next_sibling;
4732                 struct nlattr *mp;
4733
4734                 mp = nla_nest_start(skb, RTA_MULTIPATH);
4735                 if (!mp)
4736                         goto nla_put_failure;
4737
4738                 if (rt6_add_nexthop(skb, rt) < 0)
4739                         goto nla_put_failure;
4740
4741                 list_for_each_entry_safe(sibling, next_sibling,
4742                                          &rt->fib6_siblings, fib6_siblings) {
4743                         if (rt6_add_nexthop(skb, sibling) < 0)
4744                                 goto nla_put_failure;
4745                 }
4746
4747                 nla_nest_end(skb, mp);
4748         } else {
4749                 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4750                         goto nla_put_failure;
4751         }
4752
4753         if (rt6_flags & RTF_EXPIRES) {
4754                 expires = dst ? dst->expires : rt->expires;
4755                 expires -= jiffies;
4756         }
4757
4758         if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4759                 goto nla_put_failure;
4760
4761         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4762                 goto nla_put_failure;
4763
4764
4765         nlmsg_end(skb, nlh);
4766         return 0;
4767
4768 nla_put_failure:
4769         nlmsg_cancel(skb, nlh);
4770         return -EMSGSIZE;
4771 }
4772
4773 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4774                                const struct net_device *dev)
4775 {
4776         if (f6i->fib6_nh.nh_dev == dev)
4777                 return true;
4778
4779         if (f6i->fib6_nsiblings) {
4780                 struct fib6_info *sibling, *next_sibling;
4781
4782                 list_for_each_entry_safe(sibling, next_sibling,
4783                                          &f6i->fib6_siblings, fib6_siblings) {
4784                         if (sibling->fib6_nh.nh_dev == dev)
4785                                 return true;
4786                 }
4787         }
4788
4789         return false;
4790 }
4791
4792 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4793 {
4794         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4795         struct fib_dump_filter *filter = &arg->filter;
4796         unsigned int flags = NLM_F_MULTI;
4797         struct net *net = arg->net;
4798
4799         if (rt == net->ipv6.fib6_null_entry)
4800                 return 0;
4801
4802         if ((filter->flags & RTM_F_PREFIX) &&
4803             !(rt->fib6_flags & RTF_PREFIX_RT)) {
4804                 /* success since this is not a prefix route */
4805                 return 1;
4806         }
4807         if (filter->filter_set) {
4808                 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4809                     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4810                     (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4811                         return 1;
4812                 }
4813                 flags |= NLM_F_DUMP_FILTERED;
4814         }
4815
4816         return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4817                              RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4818                              arg->cb->nlh->nlmsg_seq, flags);
4819 }
4820
4821 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4822                               struct netlink_ext_ack *extack)
4823 {
4824         struct net *net = sock_net(in_skb->sk);
4825         struct nlattr *tb[RTA_MAX+1];
4826         int err, iif = 0, oif = 0;
4827         struct fib6_info *from;
4828         struct dst_entry *dst;
4829         struct rt6_info *rt;
4830         struct sk_buff *skb;
4831         struct rtmsg *rtm;
4832         struct flowi6 fl6 = {};
4833         bool fibmatch;
4834
4835         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4836                           extack);
4837         if (err < 0)
4838                 goto errout;
4839
4840         err = -EINVAL;
4841         rtm = nlmsg_data(nlh);
4842         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4843         fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4844
4845         if (tb[RTA_SRC]) {
4846                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4847                         goto errout;
4848
4849                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4850         }
4851
4852         if (tb[RTA_DST]) {
4853                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4854                         goto errout;
4855
4856                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4857         }
4858
4859         if (tb[RTA_IIF])
4860                 iif = nla_get_u32(tb[RTA_IIF]);
4861
4862         if (tb[RTA_OIF])
4863                 oif = nla_get_u32(tb[RTA_OIF]);
4864
4865         if (tb[RTA_MARK])
4866                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4867
4868         if (tb[RTA_UID])
4869                 fl6.flowi6_uid = make_kuid(current_user_ns(),
4870                                            nla_get_u32(tb[RTA_UID]));
4871         else
4872                 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4873
4874         if (tb[RTA_SPORT])
4875                 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4876
4877         if (tb[RTA_DPORT])
4878                 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4879
4880         if (tb[RTA_IP_PROTO]) {
4881                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4882                                                   &fl6.flowi6_proto, extack);
4883                 if (err)
4884                         goto errout;
4885         }
4886
4887         if (iif) {
4888                 struct net_device *dev;
4889                 int flags = 0;
4890
4891                 rcu_read_lock();
4892
4893                 dev = dev_get_by_index_rcu(net, iif);
4894                 if (!dev) {
4895                         rcu_read_unlock();
4896                         err = -ENODEV;
4897                         goto errout;
4898                 }
4899
4900                 fl6.flowi6_iif = iif;
4901
4902                 if (!ipv6_addr_any(&fl6.saddr))
4903                         flags |= RT6_LOOKUP_F_HAS_SADDR;
4904
4905                 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4906
4907                 rcu_read_unlock();
4908         } else {
4909                 fl6.flowi6_oif = oif;
4910
4911                 dst = ip6_route_output(net, NULL, &fl6);
4912         }
4913
4914
4915         rt = container_of(dst, struct rt6_info, dst);
4916         if (rt->dst.error) {
4917                 err = rt->dst.error;
4918                 ip6_rt_put(rt);
4919                 goto errout;
4920         }
4921
4922         if (rt == net->ipv6.ip6_null_entry) {
4923                 err = rt->dst.error;
4924                 ip6_rt_put(rt);
4925                 goto errout;
4926         }
4927
4928         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4929         if (!skb) {
4930                 ip6_rt_put(rt);
4931                 err = -ENOBUFS;
4932                 goto errout;
4933         }
4934
4935         skb_dst_set(skb, &rt->dst);
4936
4937         rcu_read_lock();
4938         from = rcu_dereference(rt->from);
4939
4940         if (fibmatch)
4941                 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4942                                     RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4943                                     nlh->nlmsg_seq, 0);
4944         else
4945                 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4946                                     &fl6.saddr, iif, RTM_NEWROUTE,
4947                                     NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4948                                     0);
4949         rcu_read_unlock();
4950
4951         if (err < 0) {
4952                 kfree_skb(skb);
4953                 goto errout;
4954         }
4955
4956         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4957 errout:
4958         return err;
4959 }
4960
4961 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4962                      unsigned int nlm_flags)
4963 {
4964         struct sk_buff *skb;
4965         struct net *net = info->nl_net;
4966         u32 seq;
4967         int err;
4968
4969         err = -ENOBUFS;
4970         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4971
4972         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4973         if (!skb)
4974                 goto errout;
4975
4976         err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4977                             event, info->portid, seq, nlm_flags);
4978         if (err < 0) {
4979                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4980                 WARN_ON(err == -EMSGSIZE);
4981                 kfree_skb(skb);
4982                 goto errout;
4983         }
4984         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4985                     info->nlh, gfp_any());
4986         return;
4987 errout:
4988         if (err < 0)
4989                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4990 }
4991
4992 static int ip6_route_dev_notify(struct notifier_block *this,
4993                                 unsigned long event, void *ptr)
4994 {
4995         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4996         struct net *net = dev_net(dev);
4997
4998         if (!(dev->flags & IFF_LOOPBACK))
4999                 return NOTIFY_OK;
5000
5001         if (event == NETDEV_REGISTER) {
5002                 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5003                 net->ipv6.ip6_null_entry->dst.dev = dev;
5004                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5005 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5006                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5007                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5008                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5009                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5010 #endif
5011          } else if (event == NETDEV_UNREGISTER &&
5012                     dev->reg_state != NETREG_UNREGISTERED) {
5013                 /* NETDEV_UNREGISTER could be fired for multiple times by
5014                  * netdev_wait_allrefs(). Make sure we only call this once.
5015                  */
5016                 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5017 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5018                 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5019                 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5020 #endif
5021         }
5022
5023         return NOTIFY_OK;
5024 }
5025
5026 /*
5027  *      /proc
5028  */
5029
5030 #ifdef CONFIG_PROC_FS
5031 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5032 {
5033         struct net *net = (struct net *)seq->private;
5034         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5035                    net->ipv6.rt6_stats->fib_nodes,
5036                    net->ipv6.rt6_stats->fib_route_nodes,
5037                    atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5038                    net->ipv6.rt6_stats->fib_rt_entries,
5039                    net->ipv6.rt6_stats->fib_rt_cache,
5040                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5041                    net->ipv6.rt6_stats->fib_discarded_routes);
5042
5043         return 0;
5044 }
5045 #endif  /* CONFIG_PROC_FS */
5046
5047 #ifdef CONFIG_SYSCTL
5048
5049 static
5050 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5051                               void __user *buffer, size_t *lenp, loff_t *ppos)
5052 {
5053         struct net *net;
5054         int delay;
5055         if (!write)
5056                 return -EINVAL;
5057
5058         net = (struct net *)ctl->extra1;
5059         delay = net->ipv6.sysctl.flush_delay;
5060         proc_dointvec(ctl, write, buffer, lenp, ppos);
5061         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5062         return 0;
5063 }
5064
5065 static int zero;
5066 static int one = 1;
5067
5068 static struct ctl_table ipv6_route_table_template[] = {
5069         {
5070                 .procname       =       "flush",
5071                 .data           =       &init_net.ipv6.sysctl.flush_delay,
5072                 .maxlen         =       sizeof(int),
5073                 .mode           =       0200,
5074                 .proc_handler   =       ipv6_sysctl_rtcache_flush
5075         },
5076         {
5077                 .procname       =       "gc_thresh",
5078                 .data           =       &ip6_dst_ops_template.gc_thresh,
5079                 .maxlen         =       sizeof(int),
5080                 .mode           =       0644,
5081                 .proc_handler   =       proc_dointvec,
5082         },
5083         {
5084                 .procname       =       "max_size",
5085                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
5086                 .maxlen         =       sizeof(int),
5087                 .mode           =       0644,
5088                 .proc_handler   =       proc_dointvec,
5089         },
5090         {
5091                 .procname       =       "gc_min_interval",
5092                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5093                 .maxlen         =       sizeof(int),
5094                 .mode           =       0644,
5095                 .proc_handler   =       proc_dointvec_jiffies,
5096         },
5097         {
5098                 .procname       =       "gc_timeout",
5099                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5100                 .maxlen         =       sizeof(int),
5101                 .mode           =       0644,
5102                 .proc_handler   =       proc_dointvec_jiffies,
5103         },
5104         {
5105                 .procname       =       "gc_interval",
5106                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5107                 .maxlen         =       sizeof(int),
5108                 .mode           =       0644,
5109                 .proc_handler   =       proc_dointvec_jiffies,
5110         },
5111         {
5112                 .procname       =       "gc_elasticity",
5113                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5114                 .maxlen         =       sizeof(int),
5115                 .mode           =       0644,
5116                 .proc_handler   =       proc_dointvec,
5117         },
5118         {
5119                 .procname       =       "mtu_expires",
5120                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5121                 .maxlen         =       sizeof(int),
5122                 .mode           =       0644,
5123                 .proc_handler   =       proc_dointvec_jiffies,
5124         },
5125         {
5126                 .procname       =       "min_adv_mss",
5127                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5128                 .maxlen         =       sizeof(int),
5129                 .mode           =       0644,
5130                 .proc_handler   =       proc_dointvec,
5131         },
5132         {
5133                 .procname       =       "gc_min_interval_ms",
5134                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5135                 .maxlen         =       sizeof(int),
5136                 .mode           =       0644,
5137                 .proc_handler   =       proc_dointvec_ms_jiffies,
5138         },
5139         {
5140                 .procname       =       "skip_notify_on_dev_down",
5141                 .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
5142                 .maxlen         =       sizeof(int),
5143                 .mode           =       0644,
5144                 .proc_handler   =       proc_dointvec,
5145                 .extra1         =       &zero,
5146                 .extra2         =       &one,
5147         },
5148         { }
5149 };
5150
5151 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5152 {
5153         struct ctl_table *table;
5154
5155         table = kmemdup(ipv6_route_table_template,
5156                         sizeof(ipv6_route_table_template),
5157                         GFP_KERNEL);
5158
5159         if (table) {
5160                 table[0].data = &net->ipv6.sysctl.flush_delay;
5161                 table[0].extra1 = net;
5162                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5163                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5164                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5165                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5166                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5167                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5168                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5169                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5170                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5171                 table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5172
5173                 /* Don't export sysctls to unprivileged users */
5174                 if (net->user_ns != &init_user_ns)
5175                         table[0].procname = NULL;
5176         }
5177
5178         return table;
5179 }
5180 #endif
5181
5182 static int __net_init ip6_route_net_init(struct net *net)
5183 {
5184         int ret = -ENOMEM;
5185
5186         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5187                sizeof(net->ipv6.ip6_dst_ops));
5188
5189         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5190                 goto out_ip6_dst_ops;
5191
5192         net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5193                                             sizeof(*net->ipv6.fib6_null_entry),
5194                                             GFP_KERNEL);
5195         if (!net->ipv6.fib6_null_entry)
5196                 goto out_ip6_dst_entries;
5197
5198         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5199                                            sizeof(*net->ipv6.ip6_null_entry),
5200                                            GFP_KERNEL);
5201         if (!net->ipv6.ip6_null_entry)
5202                 goto out_fib6_null_entry;
5203         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5204         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5205                          ip6_template_metrics, true);
5206
5207 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5208         net->ipv6.fib6_has_custom_rules = false;
5209         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5210                                                sizeof(*net->ipv6.ip6_prohibit_entry),
5211                                                GFP_KERNEL);
5212         if (!net->ipv6.ip6_prohibit_entry)
5213                 goto out_ip6_null_entry;
5214         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5215         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5216                          ip6_template_metrics, true);
5217
5218         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5219                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
5220                                                GFP_KERNEL);
5221         if (!net->ipv6.ip6_blk_hole_entry)
5222                 goto out_ip6_prohibit_entry;
5223         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5224         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5225                          ip6_template_metrics, true);
5226 #endif
5227
5228         net->ipv6.sysctl.flush_delay = 0;
5229         net->ipv6.sysctl.ip6_rt_max_size = 4096;
5230         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5231         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5232         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5233         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5234         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5235         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5236         net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5237
5238         net->ipv6.ip6_rt_gc_expire = 30*HZ;
5239
5240         ret = 0;
5241 out:
5242         return ret;
5243
5244 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5245 out_ip6_prohibit_entry:
5246         kfree(net->ipv6.ip6_prohibit_entry);
5247 out_ip6_null_entry:
5248         kfree(net->ipv6.ip6_null_entry);
5249 #endif
5250 out_fib6_null_entry:
5251         kfree(net->ipv6.fib6_null_entry);
5252 out_ip6_dst_entries:
5253         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5254 out_ip6_dst_ops:
5255         goto out;
5256 }
5257
5258 static void __net_exit ip6_route_net_exit(struct net *net)
5259 {
5260         kfree(net->ipv6.fib6_null_entry);
5261         kfree(net->ipv6.ip6_null_entry);
5262 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5263         kfree(net->ipv6.ip6_prohibit_entry);
5264         kfree(net->ipv6.ip6_blk_hole_entry);
5265 #endif
5266         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5267 }
5268
5269 static int __net_init ip6_route_net_init_late(struct net *net)
5270 {
5271 #ifdef CONFIG_PROC_FS
5272         proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5273                         sizeof(struct ipv6_route_iter));
5274         proc_create_net_single("rt6_stats", 0444, net->proc_net,
5275                         rt6_stats_seq_show, NULL);
5276 #endif
5277         return 0;
5278 }
5279
5280 static void __net_exit ip6_route_net_exit_late(struct net *net)
5281 {
5282 #ifdef CONFIG_PROC_FS
5283         remove_proc_entry("ipv6_route", net->proc_net);
5284         remove_proc_entry("rt6_stats", net->proc_net);
5285 #endif
5286 }
5287
5288 static struct pernet_operations ip6_route_net_ops = {
5289         .init = ip6_route_net_init,
5290         .exit = ip6_route_net_exit,
5291 };
5292
5293 static int __net_init ipv6_inetpeer_init(struct net *net)
5294 {
5295         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5296
5297         if (!bp)
5298                 return -ENOMEM;
5299         inet_peer_base_init(bp);
5300         net->ipv6.peers = bp;
5301         return 0;
5302 }
5303
5304 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5305 {
5306         struct inet_peer_base *bp = net->ipv6.peers;
5307
5308         net->ipv6.peers = NULL;
5309         inetpeer_invalidate_tree(bp);
5310         kfree(bp);
5311 }
5312
5313 static struct pernet_operations ipv6_inetpeer_ops = {
5314         .init   =       ipv6_inetpeer_init,
5315         .exit   =       ipv6_inetpeer_exit,
5316 };
5317
5318 static struct pernet_operations ip6_route_net_late_ops = {
5319         .init = ip6_route_net_init_late,
5320         .exit = ip6_route_net_exit_late,
5321 };
5322
5323 static struct notifier_block ip6_route_dev_notifier = {
5324         .notifier_call = ip6_route_dev_notify,
5325         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5326 };
5327
5328 void __init ip6_route_init_special_entries(void)
5329 {
5330         /* Registering of the loopback is done before this portion of code,
5331          * the loopback reference in rt6_info will not be taken, do it
5332          * manually for init_net */
5333         init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5334         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5335         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5336   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5337         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5338         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5339         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5340         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5341   #endif
5342 }
5343
5344 int __init ip6_route_init(void)
5345 {
5346         int ret;
5347         int cpu;
5348
5349         ret = -ENOMEM;
5350         ip6_dst_ops_template.kmem_cachep =
5351                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5352                                   SLAB_HWCACHE_ALIGN, NULL);
5353         if (!ip6_dst_ops_template.kmem_cachep)
5354                 goto out;
5355
5356         ret = dst_entries_init(&ip6_dst_blackhole_ops);
5357         if (ret)
5358                 goto out_kmem_cache;
5359
5360         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5361         if (ret)
5362                 goto out_dst_entries;
5363
5364         ret = register_pernet_subsys(&ip6_route_net_ops);
5365         if (ret)
5366                 goto out_register_inetpeer;
5367
5368         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5369
5370         ret = fib6_init();
5371         if (ret)
5372                 goto out_register_subsys;
5373
5374         ret = xfrm6_init();
5375         if (ret)
5376                 goto out_fib6_init;
5377
5378         ret = fib6_rules_init();
5379         if (ret)
5380                 goto xfrm6_init;
5381
5382         ret = register_pernet_subsys(&ip6_route_net_late_ops);
5383         if (ret)
5384                 goto fib6_rules_init;
5385
5386         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5387                                    inet6_rtm_newroute, NULL, 0);
5388         if (ret < 0)
5389                 goto out_register_late_subsys;
5390
5391         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5392                                    inet6_rtm_delroute, NULL, 0);
5393         if (ret < 0)
5394                 goto out_register_late_subsys;
5395
5396         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5397                                    inet6_rtm_getroute, NULL,
5398                                    RTNL_FLAG_DOIT_UNLOCKED);
5399         if (ret < 0)
5400                 goto out_register_late_subsys;
5401
5402         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5403         if (ret)
5404                 goto out_register_late_subsys;
5405
5406         for_each_possible_cpu(cpu) {
5407                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5408
5409                 INIT_LIST_HEAD(&ul->head);
5410                 spin_lock_init(&ul->lock);
5411         }
5412
5413 out:
5414         return ret;
5415
5416 out_register_late_subsys:
5417         rtnl_unregister_all(PF_INET6);
5418         unregister_pernet_subsys(&ip6_route_net_late_ops);
5419 fib6_rules_init:
5420         fib6_rules_cleanup();
5421 xfrm6_init:
5422         xfrm6_fini();
5423 out_fib6_init:
5424         fib6_gc_cleanup();
5425 out_register_subsys:
5426         unregister_pernet_subsys(&ip6_route_net_ops);
5427 out_register_inetpeer:
5428         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5429 out_dst_entries:
5430         dst_entries_destroy(&ip6_dst_blackhole_ops);
5431 out_kmem_cache:
5432         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5433         goto out;
5434 }
5435
5436 void ip6_route_cleanup(void)
5437 {
5438         unregister_netdevice_notifier(&ip6_route_dev_notifier);
5439         unregister_pernet_subsys(&ip6_route_net_late_ops);
5440         fib6_rules_cleanup();
5441         xfrm6_fini();
5442         fib6_gc_cleanup();
5443         unregister_pernet_subsys(&ipv6_inetpeer_ops);
5444         unregister_pernet_subsys(&ip6_route_net_ops);
5445         dst_entries_destroy(&ip6_dst_blackhole_ops);
5446         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5447 }
This page took 0.321165 seconds and 4 git commands to generate.