2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
62 #include <asm/uaccess.h>
65 #include <linux/sysctl.h>
68 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
69 const struct in6_addr *dest);
70 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
71 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
72 static unsigned int ip6_mtu(const struct dst_entry *dst);
73 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
74 static void ip6_dst_destroy(struct dst_entry *);
75 static void ip6_dst_ifdown(struct dst_entry *,
76 struct net_device *dev, int how);
77 static int ip6_dst_gc(struct dst_ops *ops);
79 static int ip6_pkt_discard(struct sk_buff *skb);
80 static int ip6_pkt_discard_out(struct sk_buff *skb);
81 static void ip6_link_failure(struct sk_buff *skb);
82 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
83 struct sk_buff *skb, u32 mtu);
84 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
86 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
88 #ifdef CONFIG_IPV6_ROUTE_INFO
89 static struct rt6_info *rt6_add_route_info(struct net *net,
90 const struct in6_addr *prefix, int prefixlen,
91 const struct in6_addr *gwaddr, int ifindex,
93 static struct rt6_info *rt6_get_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex);
/* Copy-on-write the dst's routing metrics into an inet_peer-backed array
 * so they can be modified per-destination without touching shared data.
 * NOTE(review): this extract is missing lines (the declaration of 'p',
 * several branches and the closing logic) — comments cover only what is
 * visible; confirm against the full source.
 */
98 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
100 struct rt6_info *rt = (struct rt6_info *) dst;
101 struct inet_peer *peer;
/* Only host (fully-specified /128) routes get private metrics. */
104 if (!(rt->dst.flags & DST_HOST))
107 peer = rt6_get_peer_create(rt);
109 u32 *old_p = __DST_METRICS_PTR(old);
110 unsigned long prev, new;
/* Freshly created peer: seed its metrics from the old shared array. */
113 if (inet_metrics_new(peer))
114 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
/* Atomically swing dst->_metrics to the peer's array; cmpxchg guards
 * against a concurrent writer having done the same swap first. */
116 new = (unsigned long) p;
117 prev = cmpxchg(&dst->_metrics, old, new);
120 p = __DST_METRICS_PTR(prev);
121 if (prev & DST_METRICS_READ_ONLY)
/* Pick the address to key the neighbour cache with: the route's gateway
 * if one is set, otherwise fall back to the packet's destination address.
 */
128 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
132 struct in6_addr *p = &rt->rt6i_gateway;
134 if (!ipv6_addr_any(p))
135 return (const void *) p;
137 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup hook: find (or, on miss, create) the neighbour
 * entry for this dst on dst->dev in the IPv6 ndisc table (nd_tbl).
 */
141 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
145 struct rt6_info *rt = (struct rt6_info *) dst;
148 daddr = choose_neigh_daddr(rt, skb, daddr);
149 n = __ipv6_neigh_lookup(dst->dev, daddr);
152 return neigh_create(&nd_tbl, daddr, dst->dev);
/* Template dst_ops for normal IPv6 routes; cloned per network namespace
 * (net->ipv6.ip6_dst_ops). Each member wires a generic dst operation to
 * its IPv6-specific implementation defined in this file.
 */
155 static struct dst_ops ip6_dst_ops_template = {
157 .protocol = cpu_to_be16(ETH_P_IPV6),
160 .check = ip6_dst_check,
161 .default_advmss = ip6_default_advmss,
163 .cow_metrics = ipv6_cow_metrics,
164 .destroy = ip6_dst_destroy,
165 .ifdown = ip6_dst_ifdown,
166 .negative_advice = ip6_negative_advice,
167 .link_failure = ip6_link_failure,
168 .update_pmtu = ip6_rt_update_pmtu,
169 .redirect = rt6_do_redirect,
170 .local_out = __ip6_local_out,
171 .neigh_lookup = ip6_neigh_lookup,
/* MTU for blackhole dsts: the cached metric if set, else the device MTU. */
174 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
176 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
178 return mtu ? : dst->dev->mtu;
/* Blackhole routes deliberately ignore PMTU updates and redirects, and
 * never hand out writable metrics — bodies are intentionally no-ops
 * (their interiors are not visible in this extract).
 */
181 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
182 struct sk_buff *skb, u32 mtu)
186 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
191 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops used by ip6_blackhole_route() for xfrm-style blackholing. */
197 static struct dst_ops ip6_dst_blackhole_ops = {
199 .protocol = cpu_to_be16(ETH_P_IPV6),
200 .destroy = ip6_dst_destroy,
201 .check = ip6_dst_check,
202 .mtu = ip6_blackhole_mtu,
203 .default_advmss = ip6_default_advmss,
204 .update_pmtu = ip6_rt_blackhole_update_pmtu,
205 .redirect = ip6_rt_blackhole_redirect,
206 .cow_metrics = ip6_rt_blackhole_cow_metrics,
207 .neigh_lookup = ip6_neigh_lookup,
/* Metrics shared by the template routes below (hop limit 0 = use default). */
210 static const u32 ip6_template_metrics[RTAX_MAX] = {
211 [RTAX_HOPLIMIT - 1] = 0,
/* The "null" route: matched when no real route exists. Rejects traffic
 * with ENETUNREACH; ~0 metric sorts it after every real route.
 */
214 static const struct rt6_info ip6_null_entry_template = {
216 .__refcnt = ATOMIC_INIT(1),
218 .obsolete = DST_OBSOLETE_FORCE_CHK,
219 .error = -ENETUNREACH,
220 .input = ip6_pkt_discard,
221 .output = ip6_pkt_discard_out,
223 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
224 .rt6i_protocol = RTPROT_KERNEL,
225 .rt6i_metric = ~(u32) 0,
226 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing, two more terminal route types exist:
 * "prohibit" (administratively denied) and "blackhole" (silent drop).
 */
229 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
231 static int ip6_pkt_prohibit(struct sk_buff *skb);
232 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
234 static const struct rt6_info ip6_prohibit_entry_template = {
236 .__refcnt = ATOMIC_INIT(1),
238 .obsolete = DST_OBSOLETE_FORCE_CHK,
240 .input = ip6_pkt_prohibit,
241 .output = ip6_pkt_prohibit_out,
243 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
244 .rt6i_protocol = RTPROT_KERNEL,
245 .rt6i_metric = ~(u32) 0,
246 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole: discard silently in both directions, no ICMP error. */
249 static const struct rt6_info ip6_blk_hole_entry_template = {
251 .__refcnt = ATOMIC_INIT(1),
253 .obsolete = DST_OBSOLETE_FORCE_CHK,
255 .input = dst_discard,
256 .output = dst_discard,
258 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
259 .rt6i_protocol = RTPROT_KERNEL,
260 .rt6i_metric = ~(u32) 0,
261 .rt6i_ref = ATOMIC_INIT(1),
266 /* allocate dst with ip6_dst_ops */
/* Allocate and zero-initialize an rt6_info in this namespace's dst pool.
 * The peer base comes from 'table' when given, else the namespace-wide
 * one. NOTE(review): the flags parameter line and error path are missing
 * from this extract.
 */
267 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
268 struct net_device *dev,
270 struct fib6_table *table)
272 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
273 0, DST_OBSOLETE_FORCE_CHK, flags);
276 struct dst_entry *dst = &rt->dst;
/* Zero everything after the embedded dst_entry (the rt6-specific part). */
278 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
279 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
280 rt->rt6i_genid = rt_genid(net);
281 INIT_LIST_HEAD(&rt->rt6i_siblings);
282 rt->rt6i_nsiblings = 0;
/* dst_ops->destroy: release resources held by an rt6_info — private
 * metrics (non-host routes), the inet6_dev reference, and the peer.
 * NOTE(review): the idev/from release statements are missing from this
 * extract.
 */
287 static void ip6_dst_destroy(struct dst_entry *dst)
289 struct rt6_info *rt = (struct rt6_info *)dst;
290 struct inet6_dev *idev = rt->rt6i_idev;
291 struct dst_entry *from = dst->from;
293 if (!(rt->dst.flags & DST_HOST))
294 dst_destroy_metrics_generic(dst);
297 rt->rt6i_idev = NULL;
304 if (rt6_has_peer(rt)) {
305 struct inet_peer *peer = rt6_peer_ptr(rt);
/* Look up (or create, if 'create') the inet_peer for this route's
 * destination and attach it; the set may lose a race with another CPU.
 */
310 void rt6_bind_peer(struct rt6_info *rt, int create)
312 struct inet_peer_base *base;
313 struct inet_peer *peer;
315 base = inetpeer_base_ptr(rt->_rt6i_peer);
319 peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
321 if (!rt6_set_peer(rt, peer))
/* dst_ops->ifdown: a device is going away — reparent this route's
 * inet6_dev reference onto the namespace's loopback device so the dst
 * remains usable until it is garbage collected.
 */
326 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
329 struct rt6_info *rt = (struct rt6_info *)dst;
330 struct inet6_dev *idev = rt->rt6i_idev;
331 struct net_device *loopback_dev =
332 dev_net(dev)->loopback_dev;
334 if (dev != loopback_dev) {
335 if (idev && idev->dev == dev) {
336 struct inet6_dev *loopback_idev =
337 in6_dev_get(loopback_dev);
339 rt->rt6i_idev = loopback_idev;
/* True if this route has expired: either its own RTF_EXPIRES timestamp
 * has passed, or (for clones) the parent route it was copied from has.
 */
346 static bool rt6_check_expired(const struct rt6_info *rt)
348 if (rt->rt6i_flags & RTF_EXPIRES) {
349 if (time_after(jiffies, rt->dst.expires))
351 } else if (rt->dst.from) {
352 return rt6_check_expired((struct rt6_info *) rt->dst.from);
/* Scoped addresses (multicast/link-local/loopback) require an exact
 * interface match during lookup.
 */
357 static bool rt6_need_strict(const struct in6_addr *daddr)
359 return ipv6_addr_type(daddr) &
360 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)
363 /* Multipath route selection:
364 * Hash based function using packet header and flowlabel.
365 * Adapted from fib_info_hashfn()
/* Hash the flow (proto, addresses, L4 ports or ICMP type/code, flow
 * label) into [0, candidate_count) to pick one of the sibling routes.
 */
367 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
368 const struct flowi6 *fl6)
370 unsigned int val = fl6->flowi6_proto;
372 val ^= ipv6_addr_hash(&fl6->daddr);
373 val ^= ipv6_addr_hash(&fl6->saddr);
375 /* Works only if this is not encapsulated */
376 switch (fl6->flowi6_proto) {
380 val ^= (__force u16)fl6->fl6_sport;
381 val ^= (__force u16)fl6->fl6_dport;
385 val ^= (__force u16)fl6->fl6_icmp_type;
386 val ^= (__force u16)fl6->fl6_icmp_code;
389 /* RFC 6438 recommends using the flow label */
390 val ^= (__force u32)fl6->flowlabel;
392 /* Perhaps this mixing function needs tuning */
393 val = val ^ (val >> 7) ^ (val >> 12);
394 return val % candidate_count;
/* Choose one route among 'match' and its ECMP siblings by flow hash;
 * siblings that fail rt6_score_route() are skipped.
 */
397 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
398 struct flowi6 *fl6, int oif,
401 struct rt6_info *sibling, *next_sibling;
404 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
405 /* Don't change the route if route_choosen == 0
406 * (the siblings list does not include ourself)
409 list_for_each_entry_safe(sibling, next_sibling,
410 &match->rt6i_siblings, rt6i_siblings) {
412 if (route_choosen == 0) {
413 if (rt6_score_route(sibling, oif, strict) < 0)
423 * Route lookup. Any table->tb6_lock is implied.
/* Walk the routes on a fib6 node and pick the one matching the output
 * interface 'oif' (or the source address when oif is 0). Loopback
 * routes are remembered as a fallback. With RT6_LOOKUP_F_IFACE and no
 * match, the null entry is returned instead of a wrong-interface route.
 */
426 static inline struct rt6_info *rt6_device_match(struct net *net,
428 const struct in6_addr *saddr,
432 struct rt6_info *local = NULL;
433 struct rt6_info *sprt;
/* Nothing to constrain on: first route wins (line truncated here). */
435 if (!oif && ipv6_addr_any(saddr))
438 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
439 struct net_device *dev = sprt->dst.dev;
442 if (dev->ifindex == oif)
444 if (dev->flags & IFF_LOOPBACK) {
445 if (!sprt->rt6i_idev ||
446 sprt->rt6i_idev->dev->ifindex != oif) {
447 if (flags & RT6_LOOKUP_F_IFACE && oif)
449 if (local && (!oif ||
450 local->rt6i_idev->dev->ifindex == oif))
/* No oif constraint: match on the source address instead. */
456 if (ipv6_chk_addr(net, saddr, dev,
457 flags & RT6_LOOKUP_F_IFACE))
466 if (flags & RT6_LOOKUP_F_IFACE)
467 return net->ipv6.ip6_null_entry;
473 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing (RFC 4191): if the gateway's neighbour
 * entry is not VALID and the probe interval has elapsed, send a
 * unicast-solicit NS to the gateway's solicited-node multicast address.
 */
474 static void rt6_probe(struct rt6_info *rt)
476 struct neighbour *neigh;
478 * Okay, this does not seem to be appropriate
479 * for now, however, we need to check if it
480 * is really so; aka Router Reachability Probing.
482 * Router Reachability Probe MUST be rate-limited
483 * to no more than one per minute.
485 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
488 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
490 write_lock(&neigh->lock);
491 if (neigh->nud_state & NUD_VALID)
/* Rate limit: probe only once per rtr_probe_interval. */
496 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
497 struct in6_addr mcaddr;
498 struct in6_addr *target;
501 neigh->updated = jiffies;
502 write_unlock(&neigh->lock);
505 target = (struct in6_addr *)&rt->rt6i_gateway;
506 addrconf_addr_solict_mult(target, &mcaddr);
507 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
510 write_unlock(&neigh->lock);
512 rcu_read_unlock_bh();
/* Stub when router-preference support is compiled out. */
515 static inline void rt6_probe(struct rt6_info *rt)
521 * Default Router Selection (RFC 2461 6.3.6)
/* Score the route's device against the requested output interface:
 * exact match (or no constraint), or a loopback route whose idev
 * matches oif. Return values for each case are truncated here.
 */
523 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
525 struct net_device *dev = rt->dst.dev;
526 if (!oif || dev->ifindex == oif)
528 if ((dev->flags & IFF_LOOPBACK) &&
529 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* True when the route's next hop is (probably) reachable: either no
 * gateway is involved, or the gateway's neighbour entry is in a
 * usable NUD state.
 */
534 static inline bool rt6_check_neigh(struct rt6_info *rt)
536 struct neighbour *neigh;
539 if (rt->rt6i_flags & RTF_NONEXTHOP ||
540 !(rt->rt6i_flags & RTF_GATEWAY))
544 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
546 read_lock(&neigh->lock);
547 if (neigh->nud_state & NUD_VALID)
549 #ifdef CONFIG_IPV6_ROUTER_PREF
/* With router preference, a not-yet-FAILED neighbour still counts. */
550 else if (!(neigh->nud_state & NUD_FAILED))
553 read_unlock(&neigh->lock);
555 rcu_read_unlock_bh();
/* Combined route score: interface match in the low bits, decoded RFC 4191
 * router preference above them; negative when a strict requirement
 * (interface or reachability) is not met.
 */
560 static int rt6_score_route(struct rt6_info *rt, int oif,
565 m = rt6_check_dev(rt, oif);
566 if (!m && (strict & RT6_LOOKUP_F_IFACE))
568 #ifdef CONFIG_IPV6_ROUTER_PREF
569 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
571 if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE))
/* Compare 'rt' against the best-so-far ('match' with score *mpri) and
 * return the better of the two; expired routes never win.
 */
576 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
577 int *mpri, struct rt6_info *match)
581 if (rt6_check_expired(rt))
584 m = rt6_score_route(rt, oif, strict);
589 if (strict & RT6_LOOKUP_F_REACHABLE)
593 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/* Scan all routes at the same metric on the node, starting from the
 * round-robin head and wrapping to the leaf, keeping the best score.
 */
601 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
602 struct rt6_info *rr_head,
603 u32 metric, int oif, int strict)
605 struct rt6_info *rt, *match;
609 for (rt = rr_head; rt && rt->rt6i_metric == metric;
610 rt = rt->dst.rt6_next)
611 match = find_match(rt, oif, strict, &mpri, match);
/* Second half of the circular scan: leaf up to (not including) rr_head. */
612 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
613 rt = rt->dst.rt6_next)
614 match = find_match(rt, oif, strict, &mpri, match);
/* Default router selection for a fib6 node: use the cached round-robin
 * pointer, and when nothing reachable matched, advance it so the next
 * lookup tries a different same-metric route (round-robin fallback).
 */
619 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
621 struct rt6_info *match, *rt0;
626 fn->rr_ptr = rt0 = fn->leaf;
628 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
631 (strict & RT6_LOOKUP_F_REACHABLE)) {
632 struct rt6_info *next = rt0->dst.rt6_next;
634 /* no entries matched; do round-robin */
635 if (!next || next->rt6i_metric != rt0->rt6i_metric)
642 net = dev_net(rt0->dst.dev);
643 return match ? match : net->ipv6.ip6_null_entry;
646 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information Option received in a Router
 * Advertisement from 'gwaddr' on 'dev': validate it, then add, refresh
 * or (lifetime 0) remove the corresponding RTF_ROUTEINFO route.
 * NOTE(review): several validation bodies and the return paths are
 * missing from this extract.
 */
647 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
648 const struct in6_addr *gwaddr)
650 struct net *net = dev_net(dev);
651 struct route_info *rinfo = (struct route_info *) opt;
652 struct in6_addr prefix_buf, *prefix;
654 unsigned long lifetime;
657 if (len < sizeof(struct route_info)) {
661 /* Sanity check for prefix_len and length */
662 if (rinfo->length > 3) {
664 } else if (rinfo->prefix_len > 128) {
666 } else if (rinfo->prefix_len > 64) {
667 if (rinfo->length < 2) {
670 } else if (rinfo->prefix_len > 0) {
671 if (rinfo->length < 1) {
676 pref = rinfo->route_pref;
677 if (pref == ICMPV6_ROUTER_PREF_INVALID)
680 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means a full 16-byte prefix is present in the option. */
682 if (rinfo->length == 3)
683 prefix = (struct in6_addr *)rinfo->prefix;
685 /* this function is safe */
686 ipv6_addr_prefix(&prefix_buf,
687 (struct in6_addr *)rinfo->prefix,
689 prefix = &prefix_buf;
692 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime withdraws an existing route-info route. */
695 if (rt && !lifetime) {
701 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
704 rt->rt6i_flags = RTF_ROUTEINFO |
705 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
708 if (!addrconf_finite_timeout(lifetime))
709 rt6_clean_expires(rt);
711 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Lookup backtracking: when the lookup landed on the null entry, climb
 * toward the tree root (re-descending into source-routed subtrees where
 * present) until a node carrying real route info (RTN_RTINFO) is found
 * or the root (RTN_TL_ROOT) is reached. Expands inside the ip6_pol_route*
 * functions with 'rt' and 'fn' in scope.
 */
719 #define BACKTRACK(__net, saddr) \
721 if (rt == __net->ipv6.ip6_null_entry) { \
722 struct fib6_node *pn; \
724 if (fn->fn_flags & RTN_TL_ROOT) \
727 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
728 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
731 if (fn->fn_flags & RTN_RTINFO) \
/* Simple (no clone/COW) table lookup under tb6_lock: descend the trie,
 * constrain by device, optionally pick an ECMP sibling, backtrack on a
 * null match, and take a usage reference before unlocking.
 */
737 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
738 struct fib6_table *table,
739 struct flowi6 *fl6, int flags)
741 struct fib6_node *fn;
744 read_lock_bh(&table->tb6_lock);
745 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
748 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
749 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
750 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
751 BACKTRACK(net, &fl6->saddr);
753 dst_use(&rt->dst, jiffies);
754 read_unlock_bh(&table->tb6_lock);
/* Public wrapper: run the lookup through the policy-routing rules. */
759 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
762 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
764 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by (daddr, saddr, oif); 'strict' forces an exact
 * interface match via RT6_LOOKUP_F_IFACE.
 */
766 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
767 const struct in6_addr *saddr, int oif, int strict)
769 struct flowi6 fl6 = {
773 struct dst_entry *dst;
774 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
777 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
778 flags |= RT6_LOOKUP_F_HAS_SADDR;
781 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
783 return (struct rt6_info *) dst;
790 EXPORT_SYMBOL(rt6_lookup);
792 /* ip6_ins_rt is called with FREE table->tb6_lock.
793 It takes a new route entry; if the addition fails for any reason the
794 route is freed. In any case, if the caller does not hold it, it may
/* Insert 'rt' into its fib6 table under the table write lock. */
798 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
801 struct fib6_table *table;
803 table = rt->rt6i_table;
804 write_lock_bh(&table->tb6_lock);
805 err = fib6_add(&table->tb6_root, rt, info);
806 write_unlock_bh(&table->tb6_lock);
/* Public insert: same as above with a default nl_info for notifications. */
811 int ip6_ins_rt(struct rt6_info *rt)
813 struct nl_info info = {
814 .nl_net = dev_net(rt->dst.dev),
816 return __ip6_ins_rt(rt, &info);
/* Make a host (/128) RTF_CACHE clone of 'ort' for (daddr, saddr).
 * Non-gateway routes get daddr itself as the next hop, with RTF_ANYCAST
 * set when daddr matches the (non-/128) route prefix exactly.
 */
819 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
820 const struct in6_addr *daddr,
821 const struct in6_addr *saddr)
829 rt = ip6_rt_copy(ort, daddr);
832 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
833 if (ort->rt6i_dst.plen != 128 &&
834 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
835 rt->rt6i_flags |= RTF_ANYCAST;
836 rt->rt6i_gateway = *daddr;
839 rt->rt6i_flags |= RTF_CACHE;
841 #ifdef CONFIG_IPV6_SUBTREES
/* Source-routed subtrees: also pin the clone to the exact saddr. */
842 if (rt->rt6i_src.plen && saddr) {
843 rt->rt6i_src.addr = *saddr;
844 rt->rt6i_src.plen = 128;
/* Plain RTF_CACHE clone without COW of the next hop (gateway routes). */
852 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
853 const struct in6_addr *daddr)
855 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
858 rt->rt6i_flags |= RTF_CACHE;
/* Core policy-routing lookup used by both input and output paths:
 * select the best route (reachability-strict unless forwarding is on),
 * then, if it is not already a cached host route, drop the table lock
 * and create+insert an RTF_CACHE clone. The relookup-on-race and exit
 * paths are missing from this extract.
 */
862 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
863 struct flowi6 *fl6, int flags)
865 struct fib6_node *fn;
866 struct rt6_info *rt, *nrt;
/* Routers don't require reachable next hops; hosts do (RFC 4191). */
870 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
872 strict |= flags & RT6_LOOKUP_F_IFACE;
875 read_lock_bh(&table->tb6_lock);
878 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
881 rt = rt6_select(fn, oif, strict | reachable);
882 if (rt->rt6i_nsiblings)
883 rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
884 BACKTRACK(net, &fl6->saddr);
885 if (rt == net->ipv6.ip6_null_entry ||
886 rt->rt6i_flags & RTF_CACHE)
890 read_unlock_bh(&table->tb6_lock);
/* COW clone for non-gateway routes; plain clone for non-host ones. */
892 if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
893 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
894 else if (!(rt->dst.flags & DST_HOST))
895 nrt = rt6_alloc_clone(rt, &fl6->daddr);
900 rt = nrt ? : net->ipv6.ip6_null_entry;
904 err = ip6_ins_rt(nrt);
913 * Race condition! In the gap, when table->tb6_lock was
914 * released someone could insert this route. Relookup.
925 read_unlock_bh(&table->tb6_lock);
927 rt->dst.lastuse = jiffies;
/* Input-path wrapper: the "oif" used for scoring is the input interface. */
933 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
934 struct flowi6 *fl6, int flags)
936 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input lookup entry: scoped daddrs force strict interface matching,
 * except on PIM register pseudo-devices.
 */
939 static struct dst_entry *ip6_route_input_lookup(struct net *net,
940 struct net_device *dev,
941 struct flowi6 *fl6, int flags)
943 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
944 flags |= RT6_LOOKUP_F_IFACE;
946 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/* Attach a route (dst) to an incoming skb, keyed by its IPv6 header. */
949 void ip6_route_input(struct sk_buff *skb)
951 const struct ipv6hdr *iph = ipv6_hdr(skb);
952 struct net *net = dev_net(skb->dev);
953 int flags = RT6_LOOKUP_F_HAS_SADDR;
954 struct flowi6 fl6 = {
955 .flowi6_iif = skb->dev->ifindex,
958 .flowlabel = ip6_flowinfo(iph),
959 .flowi6_mark = skb->mark,
960 .flowi6_proto = iph->nexthdr,
963 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path wrapper: score against the requested output interface. */
966 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
967 struct flowi6 *fl6, int flags)
969 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Output route lookup for locally generated traffic: bound sockets and
 * scoped destinations force strict interface matching; a known source
 * address and socket source-address preferences refine the lookup.
 */
972 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
977 fl6->flowi6_iif = LOOPBACK_IFINDEX;
979 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
980 flags |= RT6_LOOKUP_F_IFACE;
982 if (!ipv6_addr_any(&fl6->saddr))
983 flags |= RT6_LOOKUP_F_HAS_SADDR;
985 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
987 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
990 EXPORT_SYMBOL(ip6_route_output);
/* Clone 'dst_orig' into a blackhole dst (ip6_dst_blackhole_ops): same
 * routing identity, but input/output discard packets. Used when xfrm
 * needs to hold a flow while its state resolves. Always consumes the
 * reference on dst_orig.
 */
992 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
994 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
995 struct dst_entry *new = NULL;
997 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1001 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1002 rt6_init_peer(rt, net->ipv6.peers);
1005 new->input = dst_discard;
1006 new->output = dst_discard;
/* Share read-only metrics directly; otherwise take a private copy. */
1008 if (dst_metrics_read_only(&ort->dst))
1009 new->_metrics = ort->dst._metrics;
1011 dst_copy_metrics(new, &ort->dst);
1012 rt->rt6i_idev = ort->rt6i_idev;
1014 in6_dev_hold(rt->rt6i_idev);
1016 rt->rt6i_gateway = ort->rt6i_gateway;
1017 rt->rt6i_flags = ort->rt6i_flags;
1018 rt->rt6i_metric = 0;
1020 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1021 #ifdef CONFIG_IPV6_SUBTREES
1022 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1028 dst_release(dst_orig);
1029 return new ? new : ERR_PTR(-ENOMEM);
1033 * Destination cache support functions
/* dst_ops->check: a cached dst is still valid only if the namespace
 * routing generation and the fib6 node serial number both still match
 * the cookie recorded when it was cached.
 */
1036 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1038 struct rt6_info *rt;
1040 rt = (struct rt6_info *) dst;
1042 /* All IPV6 dsts are created with ->obsolete set to the value
1043 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1044 * into this function always.
1046 if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev)))
1049 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
/* dst_ops->negative_advice: a socket reports trouble with this dst;
 * expired cache entries are dropped (body truncated in this extract).
 */
1055 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1057 struct rt6_info *rt = (struct rt6_info *) dst;
1060 if (rt->rt6i_flags & RTF_CACHE) {
1061 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: send ICMPv6 address-unreachable back, expire
 * the cache entry, or invalidate the node serial for default routes so
 * cached dsts re-validate.
 */
1073 static void ip6_link_failure(struct sk_buff *skb)
1075 struct rt6_info *rt;
1077 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1079 rt = (struct rt6_info *) skb_dst(skb);
1081 if (rt->rt6i_flags & RTF_CACHE)
1082 rt6_update_expires(rt, 0);
1083 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1084 rt->rt6i_node->fn_sernum = -1;
/* dst_ops->update_pmtu: record a smaller path MTU on a host (/128)
 * route. MTUs below IPV6_MIN_MTU (1280) keep the minimum but turn on
 * ALLFRAG (include a fragment header on every packet, RFC 2460). The
 * entry expires after ip6_rt_mtu_expires.
 */
1088 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1089 struct sk_buff *skb, u32 mtu)
1091 struct rt6_info *rt6 = (struct rt6_info*)dst;
1094 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1095 struct net *net = dev_net(dst->dev);
1097 rt6->rt6i_flags |= RTF_MODIFIED;
1098 if (mtu < IPV6_MIN_MTU) {
1099 u32 features = dst_metric(dst, RTAX_FEATURES);
1101 features |= RTAX_FEATURE_ALLFRAG;
1102 dst_metric_set(dst, RTAX_FEATURES, features);
1104 dst_metric_set(dst, RTAX_MTU, mtu);
1105 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* Apply a Packet-Too-Big report for an arbitrary skb: rebuild the flow
 * from its IPv6 header, look up the route, and update its PMTU.
 */
1109 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1112 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1113 struct dst_entry *dst;
1116 memset(&fl6, 0, sizeof(fl6));
1117 fl6.flowi6_oif = oif;
1118 fl6.flowi6_mark = mark;
1119 fl6.flowi6_flags = 0;
1120 fl6.daddr = iph->daddr;
1121 fl6.saddr = iph->saddr;
1122 fl6.flowlabel = ip6_flowinfo(iph);
1124 dst = ip6_route_output(net, NULL, &fl6);
1126 ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
1129 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-scoped variant: take oif/mark from the socket. */
1131 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1133 ip6_update_pmtu(skb, sock_net(sk), mtu,
1134 sk->sk_bound_dev_if, sk->sk_mark);
1136 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Apply an ICMPv6 Redirect for an arbitrary skb: rebuild the flow from
 * its IPv6 header, look up the current route, and run the redirect
 * handler on it.
 */
1138 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1140 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1141 struct dst_entry *dst;
1144 memset(&fl6, 0, sizeof(fl6));
1145 fl6.flowi6_oif = oif;
1146 fl6.flowi6_mark = mark;
1147 fl6.flowi6_flags = 0;
1148 fl6.daddr = iph->daddr;
1149 fl6.saddr = iph->saddr;
1150 fl6.flowlabel = ip6_flowinfo(iph);
1152 dst = ip6_route_output(net, NULL, &fl6);
1154 rt6_do_redirect(dst, NULL, skb);
1157 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Socket-scoped variant: take oif/mark from the socket. */
1159 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1161 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1163 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss: advertised TCP MSS for this path — the path
 * MTU minus IPv6+TCP headers, clamped below by the ip6_rt_min_advmss
 * sysctl and above per the comment below.
 */
1165 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1167 struct net_device *dev = dst->dev;
1168 unsigned int mtu = dst_mtu(dst);
1169 struct net *net = dev_net(dev);
1171 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1173 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1174 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1177 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1178 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1179 * IPV6_MAXPLEN is also valid and means: "any MSS,
1180 * rely only on pmtu discovery"
1182 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu: cached RTAX_MTU metric if set, else the interface's
 * IPv6 MTU (idev->cnf.mtu6); fallback paths are truncated here.
 */
1187 static unsigned int ip6_mtu(const struct dst_entry *dst)
1189 struct inet6_dev *idev;
1190 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1198 idev = __in6_dev_get(dst->dev);
1200 mtu = idev->cnf.mtu6;
/* ICMPv6 replies use throw-away dsts that live on this private list
 * (not in the fib), reaped by icmp6_dst_gc(); the spinlock guards the
 * list.
 */
1206 static struct dst_entry *icmp6_dst_gc_list;
1207 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Build a one-off host dst for sending an ICMPv6 packet to fl6->daddr
 * via 'dev', chain it on the gc list, and resolve it through xfrm.
 * Returns ERR_PTR on failure.
 */
1209 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1212 struct dst_entry *dst;
1213 struct rt6_info *rt;
1214 struct inet6_dev *idev = in6_dev_get(dev);
1215 struct net *net = dev_net(dev);
1217 if (unlikely(!idev))
1218 return ERR_PTR(-ENODEV);
1220 rt = ip6_dst_alloc(net, dev, 0, NULL);
1221 if (unlikely(!rt)) {
1223 dst = ERR_PTR(-ENOMEM);
1227 rt->dst.flags |= DST_HOST;
1228 rt->dst.output = ip6_output;
1229 atomic_set(&rt->dst.__refcnt, 1);
1230 rt->rt6i_dst.addr = fl6->daddr;
1231 rt->rt6i_dst.plen = 128;
1232 rt->rt6i_idev = idev;
1233 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
/* Push onto the gc list so icmp6_dst_gc() can reap it later. */
1235 spin_lock_bh(&icmp6_dst_lock);
1236 rt->dst.next = icmp6_dst_gc_list;
1237 icmp6_dst_gc_list = &rt->dst;
1238 spin_unlock_bh(&icmp6_dst_lock);
1240 fib6_force_start_gc(net);
1242 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Reap unreferenced entries from the icmp6 dst list. */
1248 int icmp6_dst_gc(void)
1250 struct dst_entry *dst, **pprev;
1253 spin_lock_bh(&icmp6_dst_lock);
1254 pprev = &icmp6_dst_gc_list;
1256 while ((dst = *pprev) != NULL) {
1257 if (!atomic_read(&dst->__refcnt)) {
1266 spin_unlock_bh(&icmp6_dst_lock);
/* Remove every icmp6 dst for which the predicate 'func' returns true. */
1271 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1274 struct dst_entry *dst, **pprev;
1276 spin_lock_bh(&icmp6_dst_lock);
1277 pprev = &icmp6_dst_gc_list;
1278 while ((dst = *pprev) != NULL) {
1279 struct rt6_info *rt = (struct rt6_info *) dst;
1280 if (func(rt, arg)) {
1287 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops->gc: garbage-collect the routing cache. Skipped when called
 * again within gc_min_interval while under rt_max_size. The adaptive
 * 'expire' age shrinks each unproductive run so collection becomes more
 * aggressive under pressure. Returns nonzero while still over the size
 * limit.
 */
1290 static int ip6_dst_gc(struct dst_ops *ops)
1292 unsigned long now = jiffies;
1293 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1294 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1295 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1296 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1297 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1298 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1301 entries = dst_entries_get_fast(ops);
1302 if (time_after(rt_last_gc + rt_min_interval, now) &&
1303 entries <= rt_max_size)
1306 net->ipv6.ip6_rt_gc_expire++;
1307 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1308 net->ipv6.ip6_rt_last_gc = now;
1309 entries = dst_entries_get_slow(ops);
1310 if (entries < ops->gc_thresh)
1311 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1313 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1314 return entries > rt_max_size;
/* Hop limit for packets sent over 'dst': the RTAX_HOPLIMIT metric if
 * set, else the interface's configured hop limit, else the namespace
 * default (devconf_all).
 */
1317 int ip6_dst_hoplimit(struct dst_entry *dst)
1319 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1320 if (hoplimit == 0) {
1321 struct net_device *dev = dst->dev;
1322 struct inet6_dev *idev;
1325 idev = __in6_dev_get(dev);
1327 hoplimit = idev->cnf.hop_limit;
1329 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1334 EXPORT_SYMBOL(ip6_dst_hoplimit);
/* Add a route from a parsed netlink/ioctl fib6_config: validate, pick
 * the table, allocate the rt6_info, fill in expiry/protocol/handlers/
 * prefixes/gateway/metrics, and insert it via __ip6_ins_rt(). Returns
 * 0 or a negative errno. NOTE(review): many error-path and cleanup
 * lines are missing from this extract.
 */
1340 int ip6_route_add(struct fib6_config *cfg)
1343 struct net *net = cfg->fc_nlinfo.nl_net;
1344 struct rt6_info *rt = NULL;
1345 struct net_device *dev = NULL;
1346 struct inet6_dev *idev = NULL;
1347 struct fib6_table *table;
1350 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
/* Source-prefix routes require CONFIG_IPV6_SUBTREES. */
1352 #ifndef CONFIG_IPV6_SUBTREES
1353 if (cfg->fc_src_len)
1356 if (cfg->fc_ifindex) {
1358 dev = dev_get_by_index(net, cfg->fc_ifindex);
1361 idev = in6_dev_get(dev);
1366 if (cfg->fc_metric == 0)
1367 cfg->fc_metric = IP6_RT_PRIO_USER;
/* Without NLM_F_CREATE only an existing table may be used (warn but
 * still create, for backward compatibility). */
1370 if (cfg->fc_nlinfo.nlh &&
1371 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1372 table = fib6_get_table(net, cfg->fc_table);
1374 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1375 table = fib6_new_table(net, cfg->fc_table);
1378 table = fib6_new_table(net, cfg->fc_table);
1384 rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
1391 if (cfg->fc_flags & RTF_EXPIRES)
1392 rt6_set_expires(rt, jiffies +
1393 clock_t_to_jiffies(cfg->fc_expires));
1395 rt6_clean_expires(rt);
1397 if (cfg->fc_protocol == RTPROT_UNSPEC)
1398 cfg->fc_protocol = RTPROT_BOOT;
1399 rt->rt6i_protocol = cfg->fc_protocol;
1401 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Input handler depends on destination class: multicast, local, forward. */
1403 if (addr_type & IPV6_ADDR_MULTICAST)
1404 rt->dst.input = ip6_mc_input;
1405 else if (cfg->fc_flags & RTF_LOCAL)
1406 rt->dst.input = ip6_input;
1408 rt->dst.input = ip6_forward;
1410 rt->dst.output = ip6_output;
1412 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1413 rt->rt6i_dst.plen = cfg->fc_dst_len;
1414 if (rt->rt6i_dst.plen == 128)
1415 rt->dst.flags |= DST_HOST;
/* Non-host routes with explicit metrics need a private metrics array. */
1417 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1418 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1423 dst_init_metrics(&rt->dst, metrics, 0);
1425 #ifdef CONFIG_IPV6_SUBTREES
1426 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1427 rt->rt6i_src.plen = cfg->fc_src_len;
1430 rt->rt6i_metric = cfg->fc_metric;
1432 /* We cannot add true routes via loopback here,
1433 they would result in kernel looping; promote them to reject routes
1435 if ((cfg->fc_flags & RTF_REJECT) ||
1436 (dev && (dev->flags & IFF_LOOPBACK) &&
1437 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1438 !(cfg->fc_flags & RTF_LOCAL))) {
1439 /* hold loopback dev/idev if we haven't done so. */
1440 if (dev != net->loopback_dev) {
1445 dev = net->loopback_dev;
1447 idev = in6_dev_get(dev);
1453 rt->dst.output = ip6_pkt_discard_out;
1454 rt->dst.input = ip6_pkt_discard;
1455 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Map the requested route type to the dst error delivered to senders. */
1456 switch (cfg->fc_type) {
1458 rt->dst.error = -EINVAL;
1461 rt->dst.error = -EACCES;
1464 rt->dst.error = -EAGAIN;
1467 rt->dst.error = -ENETUNREACH;
1473 if (cfg->fc_flags & RTF_GATEWAY) {
1474 const struct in6_addr *gw_addr;
1477 gw_addr = &cfg->fc_gateway;
1478 rt->rt6i_gateway = *gw_addr;
1479 gwa_type = ipv6_addr_type(gw_addr);
1481 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1482 struct rt6_info *grt;
1484 /* IPv6 strictly inhibits using not link-local
1485 addresses as nexthop address.
1486 Otherwise, router will not able to send redirects.
1487 It is very good, but in some (rare!) circumstances
1488 (SIT, PtP, NBMA NOARP links) it is handy to allow
1489 some exceptions. --ANK
1492 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* The gateway itself must be reachable through an existing route. */
1495 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1497 err = -EHOSTUNREACH;
1501 if (dev != grt->dst.dev) {
1507 idev = grt->rt6i_idev;
1509 in6_dev_hold(grt->rt6i_idev);
/* A route to the gateway must not itself be a gateway route. */
1511 if (!(grt->rt6i_flags & RTF_GATEWAY))
1519 if (!dev || (dev->flags & IFF_LOOPBACK))
/* Preferred source address must be configured on the route's device. */
1527 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1528 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1532 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1533 rt->rt6i_prefsrc.plen = 128;
1535 rt->rt6i_prefsrc.plen = 0;
1537 rt->rt6i_flags = cfg->fc_flags;
/* Apply any RTAX_* metrics supplied as netlink attributes. */
1544 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1545 int type = nla_type(nla);
1548 if (type > RTAX_MAX) {
1553 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1559 rt->rt6i_idev = idev;
1560 rt->rt6i_table = table;
1562 cfg->fc_nlinfo.nl_net = dev_net(dev);
1564 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * __ip6_del_rt() - unlink a route from its FIB6 table.
 * Serializes against concurrent tree updates by taking the table write
 * lock around fib6_del(); the per-netns null entry is explicitly never
 * deleted. NOTE(review): this extract is elided - the error assignment
 * for the null-entry case, the braces and the common return/put path
 * are not visible here.
 */
1576 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1579 struct fib6_table *table;
1580 struct net *net = dev_net(rt->dst.dev);
1582 if (rt == net->ipv6.ip6_null_entry) {
1587 table = rt->rt6i_table;
1588 write_lock_bh(&table->tb6_lock);
1589 err = fib6_del(rt, info);
1590 write_unlock_bh(&table->tb6_lock);
/*
 * ip6_del_rt() - public wrapper around __ip6_del_rt() that supplies a
 * minimal nl_info carrying only the route's network namespace (no
 * netlink portid/nlh, so no specific requester is notified).
 */
1597 int ip6_del_rt(struct rt6_info *rt)
1599 struct nl_info info = {
1600 .nl_net = dev_net(rt->dst.dev),
1602 return __ip6_del_rt(rt, &info);
/*
 * ip6_route_del() - delete the route described by a fib6_config.
 * Locates the fib6 node for (fc_dst/fc_dst_len, fc_src/fc_src_len)
 * under the table read lock, then walks the node's leaf chain and
 * filters candidates on ifindex, gateway and metric; the first match
 * is handed to __ip6_del_rt() after the read lock is dropped.
 * NOTE(review): elided lines hide the "table not found" / "node not
 * found" error returns and the dst_hold taken on the matched route
 * before unlocking - confirm against the full file.
 */
1605 static int ip6_route_del(struct fib6_config *cfg)
1607 struct fib6_table *table;
1608 struct fib6_node *fn;
1609 struct rt6_info *rt;
1612 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1616 read_lock_bh(&table->tb6_lock);
1618 fn = fib6_locate(&table->tb6_root,
1619 &cfg->fc_dst, cfg->fc_dst_len,
1620 &cfg->fc_src, cfg->fc_src_len);
/* Only delete a route that matches every selector the caller supplied. */
1623 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1624 if (cfg->fc_ifindex &&
1626 rt->dst.dev->ifindex != cfg->fc_ifindex))
1628 if (cfg->fc_flags & RTF_GATEWAY &&
1629 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1631 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1634 read_unlock_bh(&table->tb6_lock);
1636 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1639 read_unlock_bh(&table->tb6_lock);
/*
 * rt6_do_redirect() - process a received ICMPv6 Redirect message.
 * Validates the message per RFC 4861 (non-multicast destination,
 * link-local unicast target unless target == destination, sane ND
 * options), updates the neighbour cache entry for the new first hop,
 * clones the current route into a RTF_CACHE/RTF_DYNAMIC entry pointing
 * at the redirect target, inserts it, and fires a NETEVENT_REDIRECT
 * notification. NOTE(review): heavily elided extract - the length
 * check body, several goto/return paths, the on_link computation and
 * the release/out labels are not visible here.
 */
1644 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1646 struct net *net = dev_net(skb->dev);
1647 struct netevent_redirect netevent;
1648 struct rt6_info *rt, *nrt = NULL;
1649 struct ndisc_options ndopts;
1650 struct inet6_dev *in6_dev;
1651 struct neighbour *neigh;
1653 int optlen, on_link;
/* ND option length = transport payload minus the fixed rd_msg header. */
1656 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
1657 optlen -= sizeof(*msg);
1660 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
1664 msg = (struct rd_msg *)icmp6_hdr(skb);
1666 if (ipv6_addr_is_multicast(&msg->dest)) {
1667 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* target == dest means the destination itself is on-link (RFC 4861). */
1672 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
1674 } else if (ipv6_addr_type(&msg->target) !=
1675 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
1676 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
1680 in6_dev = __in6_dev_get(skb->dev);
/* Routers, and hosts configured to ignore redirects, drop it here. */
1683 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
1687 * The IP source address of the Redirect MUST be the same as the current
1688 * first-hop router for the specified ICMP Destination Address.
1691 if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
1692 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
1697 if (ndopts.nd_opts_tgt_lladdr) {
1698 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
1701 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
1706 rt = (struct rt6_info *) dst;
1707 if (rt == net->ipv6.ip6_null_entry) {
1708 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1712 /* Redirect received -> path was valid.
1713 * Look, redirects are sent only in response to data packets,
1714 * so that this nexthop apparently is reachable. --ANK
1716 dst_confirm(&rt->dst);
/* Create (or find) the neighbour entry for the new first hop. */
1718 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
1723 * We have finally decided to accept it.
1726 neigh_update(neigh, lladdr, NUD_STALE,
1727 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1728 NEIGH_UPDATE_F_OVERRIDE|
1729 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1730 NEIGH_UPDATE_F_ISROUTER))
/* Clone the old route into a host cache entry for the new next hop. */
1733 nrt = ip6_rt_copy(rt, &msg->dest);
1737 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1739 nrt->rt6i_flags &= ~RTF_GATEWAY;
1741 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1743 if (ip6_ins_rt(nrt))
1746 netevent.old = &rt->dst;
1747 netevent.new = &nrt->dst;
1748 netevent.daddr = &msg->dest;
1749 netevent.neigh = neigh;
1750 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* A superseded cached clone is removed from the tree. */
1752 if (rt->rt6i_flags & RTF_CACHE) {
1753 rt = (struct rt6_info *) dst_clone(&rt->dst);
1758 neigh_release(neigh);
1762 * Misc support functions
/*
 * ip6_rt_copy() - duplicate a route as a /128 host route toward @dest.
 * Allocates a fresh rt6_info on the same device/netns and copies the
 * parent's input/output hooks, metrics, error code, idev, gateway and
 * flags; the new entry's metric is forced to 0 and its destination is
 * pinned to @dest with plen 128 (DST_HOST). Addrconf'd default routes
 * additionally record their origin via rt6_set_from().
 * NOTE(review): the NULL-check on the allocation and the function's
 * return statement are elided from this extract.
 */
1765 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1766 const struct in6_addr *dest)
1768 struct net *net = dev_net(ort->dst.dev);
1769 struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1773 rt->dst.input = ort->dst.input;
1774 rt->dst.output = ort->dst.output;
1775 rt->dst.flags |= DST_HOST;
1777 rt->rt6i_dst.addr = *dest;
1778 rt->rt6i_dst.plen = 128;
1779 dst_copy_metrics(&rt->dst, &ort->dst);
1780 rt->dst.error = ort->dst.error;
1781 rt->rt6i_idev = ort->rt6i_idev;
1783 in6_dev_hold(rt->rt6i_idev);
1784 rt->dst.lastuse = jiffies;
1786 rt->rt6i_gateway = ort->rt6i_gateway;
1787 rt->rt6i_flags = ort->rt6i_flags;
1788 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1789 (RTF_DEFAULT | RTF_ADDRCONF))
1790 rt6_set_from(rt, ort);
1791 rt->rt6i_metric = 0;
1793 #ifdef CONFIG_IPV6_SUBTREES
1794 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1796 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1797 rt->rt6i_table = ort->rt6i_table;
1802 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_get_route_info() - find an RA "Route Information" route.
 * Searches RT6_TABLE_INFO, under the table read lock, for a route to
 * prefix/prefixlen that was learned via Route Information options
 * (RTF_ROUTEINFO|RTF_GATEWAY) on @ifindex with gateway @gwaddr.
 * NOTE(review): the dst_hold on the match and the return are elided.
 */
1803 static struct rt6_info *rt6_get_route_info(struct net *net,
1804 const struct in6_addr *prefix, int prefixlen,
1805 const struct in6_addr *gwaddr, int ifindex)
1807 struct fib6_node *fn;
1808 struct rt6_info *rt = NULL;
1809 struct fib6_table *table;
1811 table = fib6_get_table(net, RT6_TABLE_INFO);
1815 read_lock_bh(&table->tb6_lock);
1816 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1820 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1821 if (rt->dst.dev->ifindex != ifindex)
1823 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1825 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1831 read_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_route_info() - install a route learned from an RA Route
 * Information option into RT6_TABLE_INFO, then look it back up.
 * A zero prefix length is treated as a default route (RTF_DEFAULT).
 * Returning the result of rt6_get_route_info() (rather than the
 * ip6_route_add() status) gives the caller a referenced rt6_info.
 */
1835 static struct rt6_info *rt6_add_route_info(struct net *net,
1836 const struct in6_addr *prefix, int prefixlen,
1837 const struct in6_addr *gwaddr, int ifindex,
1840 struct fib6_config cfg = {
1841 .fc_table = RT6_TABLE_INFO,
1842 .fc_metric = IP6_RT_PRIO_USER,
1843 .fc_ifindex = ifindex,
1844 .fc_dst_len = prefixlen,
1845 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1846 RTF_UP | RTF_PREF(pref),
1847 .fc_nlinfo.portid = 0,
1848 .fc_nlinfo.nlh = NULL,
1849 .fc_nlinfo.nl_net = net,
1852 cfg.fc_dst = *prefix;
1853 cfg.fc_gateway = *gwaddr;
1855 /* We should treat it as a default route if prefix length is 0. */
1857 cfg.fc_flags |= RTF_DEFAULT;
1859 ip6_route_add(&cfg);
1861 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router() - find the RA-learned default route via @addr
 * on @dev in RT6_TABLE_DFLT. Matches on device, the combined
 * RTF_ADDRCONF|RTF_DEFAULT flags and the gateway address, under the
 * table read lock. NOTE(review): the dst_hold on the match and the
 * return are elided from this extract.
 */
1865 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1867 struct rt6_info *rt;
1868 struct fib6_table *table;
1870 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1874 read_lock_bh(&table->tb6_lock);
1875 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1876 if (dev == rt->dst.dev &&
1877 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1878 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1883 read_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router() - install an RA-learned default router.
 * Adds an expiring (RTF_EXPIRES) addrconf default route via @gwaddr
 * on @dev into RT6_TABLE_DFLT, then re-looks it up so the caller gets
 * a referenced rt6_info regardless of the add outcome.
 */
1887 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1888 struct net_device *dev,
1891 struct fib6_config cfg = {
1892 .fc_table = RT6_TABLE_DFLT,
1893 .fc_metric = IP6_RT_PRIO_USER,
1894 .fc_ifindex = dev->ifindex,
1895 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1896 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1897 .fc_nlinfo.portid = 0,
1898 .fc_nlinfo.nlh = NULL,
1899 .fc_nlinfo.nl_net = dev_net(dev),
1902 cfg.fc_gateway = *gwaddr;
1904 ip6_route_add(&cfg);
1906 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers() - remove all RA-learned default routes.
 * Walks RT6_TABLE_DFLT and deletes every RTF_DEFAULT/RTF_ADDRCONF
 * route whose interface is not configured to always accept RAs
 * (accept_ra != 2). The read lock is dropped before each deletion,
 * so the scan restarts afterwards - elided lines presumably contain
 * the dst_hold, ip6_del_rt() call and the "goto restart" loop.
 */
1909 void rt6_purge_dflt_routers(struct net *net)
1911 struct rt6_info *rt;
1912 struct fib6_table *table;
1914 /* NOTE: Keep consistent with rt6_get_dflt_router */
1915 table = fib6_get_table(net, RT6_TABLE_DFLT);
1920 read_lock_bh(&table->tb6_lock);
1921 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1922 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1923 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1925 read_unlock_bh(&table->tb6_lock);
1930 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config() - translate a legacy ioctl in6_rtmsg into the
 * internal fib6_config representation. Everything goes into the main
 * table; untouched fib6_config fields are zeroed by the memset.
 */
1933 static void rtmsg_to_fib6_config(struct net *net,
1934 struct in6_rtmsg *rtmsg,
1935 struct fib6_config *cfg)
1937 memset(cfg, 0, sizeof(*cfg));
1939 cfg->fc_table = RT6_TABLE_MAIN;
1940 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1941 cfg->fc_metric = rtmsg->rtmsg_metric;
1942 cfg->fc_expires = rtmsg->rtmsg_info;
1943 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1944 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1945 cfg->fc_flags = rtmsg->rtmsg_flags;
1947 cfg->fc_nlinfo.nl_net = net;
1949 cfg->fc_dst = rtmsg->rtmsg_dst;
1950 cfg->fc_src = rtmsg->rtmsg_src;
1951 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * ipv6_route_ioctl() - legacy SIOCADDRT/SIOCDELRT entry point.
 * Requires CAP_NET_ADMIN in the netns's user namespace, copies the
 * in6_rtmsg from userspace, converts it with rtmsg_to_fib6_config()
 * and dispatches to ip6_route_add()/ip6_route_del().
 * NOTE(review): the rtnl locking, copy_from_user error handling and
 * the default -EINVAL return are elided from this extract.
 */
1954 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1956 struct fib6_config cfg;
1957 struct in6_rtmsg rtmsg;
1961 case SIOCADDRT: /* Add a route */
1962 case SIOCDELRT: /* Delete a route */
1963 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1965 err = copy_from_user(&rtmsg, arg,
1966 sizeof(struct in6_rtmsg));
1970 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1975 err = ip6_route_add(&cfg);
1978 err = ip6_route_del(&cfg);
1992 * Drop the packet on the floor
/*
 * ip6_pkt_drop() - common sink for undeliverable packets.
 * Bumps the appropriate per-idev SNMP counter (INADDRERRORS for an
 * unspecified destination on input, otherwise the supplied no-route
 * counter) and emits an ICMPv6 Destination Unreachable with @code.
 * NOTE(review): the kfree_skb() and return at the end are elided.
 */
1995 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1998 struct dst_entry *dst = skb_dst(skb);
1999 switch (ipstats_mib_noroutes) {
2000 case IPSTATS_MIB_INNOROUTES:
2001 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2002 if (type == IPV6_ADDR_ANY) {
2003 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2004 IPSTATS_MIB_INADDRERRORS);
2008 case IPSTATS_MIB_OUTNOROUTES:
2009 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2010 ipstats_mib_noroutes);
2013 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input hook for the null entry: drop with "no route" on input. */
2018 static int ip6_pkt_discard(struct sk_buff *skb)
2020 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output hook: same drop, but attribute it to the output path. */
2023 static int ip6_pkt_discard_out(struct sk_buff *skb)
2025 skb->dev = skb_dst(skb)->dev;
2026 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2029 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Policy-routing "prohibit" hooks: like the discard pair above, but
 * report ICMPV6_ADM_PROHIBITED instead of "no route". */
2031 static int ip6_pkt_prohibit(struct sk_buff *skb)
2033 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2036 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2038 skb->dev = skb_dst(skb)->dev;
2039 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2045 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc() - build the local /128 route for a configured
 * unicast or anycast address. The route lives on the loopback device,
 * delivers via ip6_input/ip6_output, is flagged RTF_LOCAL or
 * RTF_ANYCAST accordingly, and goes into RT6_TABLE_LOCAL. Returns
 * ERR_PTR(-ENOMEM) when ip6_dst_alloc() fails. NOTE(review): the
 * anycast/local condition and the final return are elided here.
 */
2048 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2049 const struct in6_addr *addr,
2052 struct net *net = dev_net(idev->dev);
2053 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2056 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2057 return ERR_PTR(-ENOMEM);
2062 rt->dst.flags |= DST_HOST;
2063 rt->dst.input = ip6_input;
2064 rt->dst.output = ip6_output;
2065 rt->rt6i_idev = idev;
2067 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2069 rt->rt6i_flags |= RTF_ANYCAST;
2071 rt->rt6i_flags |= RTF_LOCAL;
2073 rt->rt6i_dst.addr = *addr;
2074 rt->rt6i_dst.plen = 128;
2075 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
/* Caller receives the initial reference. */
2077 atomic_set(&rt->dst.__refcnt, 1);
/*
 * ip6_route_get_saddr() - pick a source address for @daddr over @rt.
 * A configured preferred source (rt6i_prefsrc.plen != 0) wins
 * outright; otherwise fall back to normal source address selection
 * on the route's interface. NOTE(review): the "err = 0" assignment
 * for the prefsrc branch and the return are elided in this extract.
 */
2082 int ip6_route_get_saddr(struct net *net,
2083 struct rt6_info *rt,
2084 const struct in6_addr *daddr,
2086 struct in6_addr *saddr)
2088 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2090 if (rt->rt6i_prefsrc.plen)
2091 *saddr = rt->rt6i_prefsrc.addr;
2093 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2094 daddr, prefs, saddr);
2098 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6 walk below: which device/netns the
 * deleted address belonged to, and the address itself. */
2099 struct arg_dev_net_ip {
2100 struct net_device *dev;
2102 struct in6_addr *addr;
/*
 * fib6_remove_prefsrc() - per-route callback: clear rt6i_prefsrc when
 * it references the address being removed. A NULL dev in the argument
 * matches routes on any device; the netns null entry is skipped.
 * Returning 0 keeps the route in the tree (only prefsrc is wiped).
 */
2105 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2107 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2108 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2109 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2111 if (((void *)rt->dst.dev == dev || !dev) &&
2112 rt != net->ipv6.ip6_null_entry &&
2113 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2114 /* remove prefsrc entry */
2115 rt->rt6i_prefsrc.plen = 0;
/*
 * rt6_remove_prefsrc() - walk every fib6 table and drop references to
 * the just-deleted address @ifp as a preferred source.
 */
2120 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2122 struct net *net = dev_net(ifp->idev->dev);
2123 struct arg_dev_net_ip adni = {
2124 .dev = ifp->idev->dev,
2128 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Argument bundle for the ifdown walk: device going away (+ netns). */
2131 struct arg_dev_net {
2132 struct net_device *dev;
/*
 * fib6_ifdown() - per-route callback: select routes on the departing
 * device (or, with dev == NULL, all routes) for deletion, sparing the
 * netns null entry. NOTE(review): the nonzero "delete me" return
 * value is on an elided line.
 */
2136 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2138 const struct arg_dev_net *adn = arg;
2139 const struct net_device *dev = adn->dev;
2141 if ((rt->dst.dev == dev || !dev) &&
2142 rt != adn->net->ipv6.ip6_null_entry)
/*
 * rt6_ifdown() - purge all routes (FIB and ICMP cache) that reference
 * @dev when the device is going down/unregistering.
 */
2148 void rt6_ifdown(struct net *net, struct net_device *dev)
2150 struct arg_dev_net adn = {
2155 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2156 icmp6_clean_all(fib6_ifdown, &adn);
/* Argument bundle for the MTU-change walk: device and its new MTU. */
2159 struct rt6_mtu_change_arg {
2160 struct net_device *dev;
/*
 * rt6_mtu_change_route() - per-route callback applying a device MTU
 * change to cached path MTUs. The long comment below explains the
 * policy: decreases always propagate; increases only when this route's
 * PMTU equalled the old device MTU (i.e. the device was the path
 * bottleneck). Locked RTAX_MTU metrics are left alone.
 */
2164 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2166 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2167 struct inet6_dev *idev;
2169 /* In IPv6 pmtu discovery is not optional,
2170 so that RTAX_MTU lock cannot disable it.
2171 We still use this lock to block changes
2172 caused by addrconf/ndisc.
2175 idev = __in6_dev_get(arg->dev);
2179 /* For administrative MTU increase, there is no way to discover
2180 IPv6 PMTU increase, so PMTU increase should be updated here.
2181 Since RFC 1981 doesn't include administrative MTU increase
2182 update PMTU increase is a MUST. (i.e. jumbo frame)
2185 If new MTU is less than route PMTU, this new MTU will be the
2186 lowest MTU in the path, update the route PMTU to reflect PMTU
2187 decreases; if new MTU is greater than route PMTU, and the
2188 old MTU is the lowest MTU in the path, update the route PMTU
2189 to reflect the increase. In this case if the other nodes' MTU
2190 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2193 if (rt->dst.dev == arg->dev &&
2194 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2195 (dst_mtu(&rt->dst) >= arg->mtu ||
2196 (dst_mtu(&rt->dst) < arg->mtu &&
2197 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2198 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * rt6_mtu_change() - propagate a device MTU change to every fib6
 * table in the device's netns via the callback above.
 */
2203 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2205 struct rt6_mtu_change_arg arg = {
2210 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* requests.
 * Attributes not listed (e.g. RTA_DST/RTA_SRC) are length-checked by
 * hand in rtm_to_fib6_config()/inet6_rtm_getroute(). */
2213 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2214 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2215 [RTA_OIF] = { .type = NLA_U32 },
2216 [RTA_IIF] = { .type = NLA_U32 },
2217 [RTA_PRIORITY] = { .type = NLA_U32 },
2218 [RTA_METRICS] = { .type = NLA_NESTED },
2219 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
/*
 * rtm_to_fib6_config() - parse an RTM_NEWROUTE/RTM_DELROUTE netlink
 * message into a fib6_config. Validates attributes against
 * rtm_ipv6_policy, maps blackhole/prohibit/throw/unreachable types to
 * RTF_REJECT, and copies dst/src prefixes using only the bytes
 * implied by the prefix length. fc_mx/fc_mp keep pointers into the
 * original skb, so the config must not outlive the message.
 */
2222 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2223 struct fib6_config *cfg)
2226 struct nlattr *tb[RTA_MAX+1];
2229 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2234 rtm = nlmsg_data(nlh);
2235 memset(cfg, 0, sizeof(*cfg));
2237 cfg->fc_table = rtm->rtm_table;
2238 cfg->fc_dst_len = rtm->rtm_dst_len;
2239 cfg->fc_src_len = rtm->rtm_src_len;
2240 cfg->fc_flags = RTF_UP;
2241 cfg->fc_protocol = rtm->rtm_protocol;
2242 cfg->fc_type = rtm->rtm_type;
2244 if (rtm->rtm_type == RTN_UNREACHABLE ||
2245 rtm->rtm_type == RTN_BLACKHOLE ||
2246 rtm->rtm_type == RTN_PROHIBIT ||
2247 rtm->rtm_type == RTN_THROW)
2248 cfg->fc_flags |= RTF_REJECT;
2250 if (rtm->rtm_type == RTN_LOCAL)
2251 cfg->fc_flags |= RTF_LOCAL;
2253 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2254 cfg->fc_nlinfo.nlh = nlh;
2255 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2257 if (tb[RTA_GATEWAY]) {
2258 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2259 cfg->fc_flags |= RTF_GATEWAY;
/* Prefixes are carried in the minimum number of whole bytes. */
2263 int plen = (rtm->rtm_dst_len + 7) >> 3;
2265 if (nla_len(tb[RTA_DST]) < plen)
2268 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2272 int plen = (rtm->rtm_src_len + 7) >> 3;
2274 if (nla_len(tb[RTA_SRC]) < plen)
2277 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2280 if (tb[RTA_PREFSRC])
2281 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2284 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2286 if (tb[RTA_PRIORITY])
2287 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2289 if (tb[RTA_METRICS]) {
2290 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2291 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2295 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2297 if (tb[RTA_MULTIPATH]) {
2298 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2299 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
/*
 * ip6_route_multipath() - add (@add != 0) or delete each nexthop of an
 * RTA_MULTIPATH request as an independent single-path route. Each
 * rtnexthop entry may override the ifindex and gateway of the base
 * config. Deletions deliberately keep going past failures so every
 * nexthop gets a removal attempt; for additions, elided lines
 * presumably roll back the nexthops added so far - confirm against
 * the full file.
 */
2307 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2309 struct fib6_config r_cfg;
2310 struct rtnexthop *rtnh;
2313 int err = 0, last_err = 0;
2316 rtnh = (struct rtnexthop *)cfg->fc_mp;
2317 remaining = cfg->fc_mp_len;
2319 /* Parse a Multipath Entry */
2320 while (rtnh_ok(rtnh, remaining)) {
2321 memcpy(&r_cfg, cfg, sizeof(*cfg));
2322 if (rtnh->rtnh_ifindex)
2323 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2325 attrlen = rtnh_attrlen(rtnh);
2327 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2329 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2331 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
2332 r_cfg.fc_flags |= RTF_GATEWAY;
2335 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2338 /* If we are trying to remove a route, do not stop the
2339 * loop when ip6_route_del() fails (because next hop is
2340 * already gone), we should try to remove all next hops.
2343 /* If add fails, we should try to delete all
2344 * next hops that have been already added.
2350 /* Because each route is added like a single route we remove
2351 * this flag after the first nexthop (if there is a collision,
2352 * we have already fail to add the first nexthop:
2353 * fib6_add_rt2node() has reject it).
2355 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
2356 rtnh = rtnh_next(rtnh, &remaining);
/*
 * inet6_rtm_delroute() / inet6_rtm_newroute() - rtnetlink doit
 * handlers for RTM_DELROUTE and RTM_NEWROUTE. Both parse the message
 * into a fib6_config and dispatch to the multipath helper when
 * RTA_MULTIPATH was supplied (fc_mp set), otherwise to the
 * single-route add/delete paths.
 */
2362 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh)
2364 struct fib6_config cfg;
2367 err = rtm_to_fib6_config(skb, nlh, &cfg);
2372 return ip6_route_multipath(&cfg, 0);
2374 return ip6_route_del(&cfg);
2377 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh)
2379 struct fib6_config cfg;
2382 err = rtm_to_fib6_config(skb, nlh, &cfg);
2387 return ip6_route_multipath(&cfg, 1);
2389 return ip6_route_add(&cfg);
/*
 * rt6_nlmsg_size() - worst-case netlink message size for one route
 * dump entry; used to size the skb in inet6_rt_notify(). Must stay in
 * sync with the attributes emitted by rt6_fill_node() (a mismatch
 * trips the -EMSGSIZE WARN_ON there).
 */
2392 static inline size_t rt6_nlmsg_size(void)
2394 return NLMSG_ALIGN(sizeof(struct rtmsg))
2395 + nla_total_size(16) /* RTA_SRC */
2396 + nla_total_size(16) /* RTA_DST */
2397 + nla_total_size(16) /* RTA_GATEWAY */
2398 + nla_total_size(16) /* RTA_PREFSRC */
2399 + nla_total_size(4) /* RTA_TABLE */
2400 + nla_total_size(4) /* RTA_IIF */
2401 + nla_total_size(4) /* RTA_OIF */
2402 + nla_total_size(4) /* RTA_PRIORITY */
2403 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2404 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * rt6_fill_node() - serialize one rt6_info into a netlink RTM message.
 * Emits the rtmsg header plus RTA_TABLE/DST/SRC/IIF/PREFSRC/METRICS/
 * GATEWAY/OIF/PRIORITY and cacheinfo. @prefix restricts dumps to
 * prefix routes (RTF_PREFIX_RT); @dst/@src, when non-NULL, force /128
 * host semantics for a specific-route query. Reject routes map their
 * dst.error back to the corresponding RTN_* type. Returns the result
 * of nlmsg_end(), or -EMSGSIZE via the cancel path.
 * NOTE(review): the skipped lines hide several if/else headers and
 * the rtm_flags initialization - read alongside the full file.
 */
2407 static int rt6_fill_node(struct net *net,
2408 struct sk_buff *skb, struct rt6_info *rt,
2409 struct in6_addr *dst, struct in6_addr *src,
2410 int iif, int type, u32 portid, u32 seq,
2411 int prefix, int nowait, unsigned int flags)
2414 struct nlmsghdr *nlh;
2418 if (prefix) { /* user wants prefix routes only */
2419 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2420 /* success since this is not a prefix route */
2425 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2429 rtm = nlmsg_data(nlh);
2430 rtm->rtm_family = AF_INET6;
2431 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2432 rtm->rtm_src_len = rt->rt6i_src.plen;
2435 table = rt->rt6i_table->tb6_id;
2437 table = RT6_TABLE_UNSPEC;
2438 rtm->rtm_table = table;
2439 if (nla_put_u32(skb, RTA_TABLE, table))
2440 goto nla_put_failure;
/* Reject routes: recover the user-visible route type from dst.error. */
2441 if (rt->rt6i_flags & RTF_REJECT) {
2442 switch (rt->dst.error) {
2444 rtm->rtm_type = RTN_BLACKHOLE;
2447 rtm->rtm_type = RTN_PROHIBIT;
2450 rtm->rtm_type = RTN_THROW;
2453 rtm->rtm_type = RTN_UNREACHABLE;
2457 else if (rt->rt6i_flags & RTF_LOCAL)
2458 rtm->rtm_type = RTN_LOCAL;
2459 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2460 rtm->rtm_type = RTN_LOCAL;
2462 rtm->rtm_type = RTN_UNICAST;
2464 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2465 rtm->rtm_protocol = rt->rt6i_protocol;
2466 if (rt->rt6i_flags & RTF_DYNAMIC)
2467 rtm->rtm_protocol = RTPROT_REDIRECT;
2468 else if (rt->rt6i_flags & RTF_ADDRCONF) {
2469 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2470 rtm->rtm_protocol = RTPROT_RA;
2472 rtm->rtm_protocol = RTPROT_KERNEL;
2475 if (rt->rt6i_flags & RTF_CACHE)
2476 rtm->rtm_flags |= RTM_F_CLONED;
/* A caller-supplied dst/src pins the answer to a /128 host route. */
2479 if (nla_put(skb, RTA_DST, 16, dst))
2480 goto nla_put_failure;
2481 rtm->rtm_dst_len = 128;
2482 } else if (rtm->rtm_dst_len)
2483 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2484 goto nla_put_failure;
2485 #ifdef CONFIG_IPV6_SUBTREES
2487 if (nla_put(skb, RTA_SRC, 16, src))
2488 goto nla_put_failure;
2489 rtm->rtm_src_len = 128;
2490 } else if (rtm->rtm_src_len &&
2491 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2492 goto nla_put_failure;
2495 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations defer to the multicast routing cache. */
2496 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2497 int err = ip6mr_get_route(net, skb, rtm, nowait);
2502 goto nla_put_failure;
2504 if (err == -EMSGSIZE)
2505 goto nla_put_failure;
2510 if (nla_put_u32(skb, RTA_IIF, iif))
2511 goto nla_put_failure;
2513 struct in6_addr saddr_buf;
2514 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2515 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2516 goto nla_put_failure;
2519 if (rt->rt6i_prefsrc.plen) {
2520 struct in6_addr saddr_buf;
2521 saddr_buf = rt->rt6i_prefsrc.addr;
2522 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2523 goto nla_put_failure;
2526 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2527 goto nla_put_failure;
2529 if (rt->rt6i_flags & RTF_GATEWAY) {
2530 if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0)
2531 goto nla_put_failure;
2535 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2536 goto nla_put_failure;
2537 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2538 goto nla_put_failure;
2540 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2542 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2543 goto nla_put_failure;
2545 return nlmsg_end(skb, nlh);
2548 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route() - fib6 walker callback for RTM_GETROUTE dumps.
 * Honours the RTM_F_PREFIX filter from the request header (when the
 * request actually carried a struct rtmsg) and delegates the
 * serialization of each route to rt6_fill_node().
 */
2552 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2554 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2557 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2558 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2559 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2563 return rt6_fill_node(arg->net,
2564 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2565 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2566 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute() - handle an RTM_GETROUTE query for a single
 * destination. Builds a flowi6 from the RTA_SRC/RTA_DST/RTA_IIF/
 * RTA_OIF attributes, resolves it through the input path (when an
 * input interface is named) or the output path, then serializes the
 * result with rt6_fill_node() and unicasts it back to the requester.
 * NOTE(review): several error-return lines (bad attribute lengths,
 * unknown iif, alloc failure, fill failure cleanup) are elided.
 */
2569 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh)
2571 struct net *net = sock_net(in_skb->sk);
2572 struct nlattr *tb[RTA_MAX+1];
2573 struct rt6_info *rt;
2574 struct sk_buff *skb;
2577 int err, iif = 0, oif = 0;
2579 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2584 memset(&fl6, 0, sizeof(fl6));
2587 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2590 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2594 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2597 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2601 iif = nla_get_u32(tb[RTA_IIF]);
2604 oif = nla_get_u32(tb[RTA_OIF]);
2607 struct net_device *dev;
2610 dev = __dev_get_by_index(net, iif);
2616 fl6.flowi6_iif = iif;
2618 if (!ipv6_addr_any(&fl6.saddr))
2619 flags |= RT6_LOOKUP_F_HAS_SADDR;
2621 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2624 fl6.flowi6_oif = oif;
2626 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2629 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2636 /* Reserve room for dummy headers, this skb can pass
2637 through good chunk of routing engine.
2639 skb_reset_mac_header(skb);
2640 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* Attach the resolved route; the skb now owns the dst reference. */
2642 skb_dst_set(skb, &rt->dst);
2644 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2645 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
2646 nlh->nlmsg_seq, 0, 0, 0);
2652 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * inet6_rt_notify() - broadcast a route add/delete event to
 * RTNLGRP_IPV6_ROUTE listeners. The skb is sized by rt6_nlmsg_size(),
 * so a -EMSGSIZE from rt6_fill_node() indicates that size estimate is
 * stale (hence the WARN_ON). On failure the group is flagged via
 * rtnl_set_sk_err() so listeners know they missed an event.
 */
2657 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2659 struct sk_buff *skb;
2660 struct net *net = info->nl_net;
2665 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2667 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2671 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2672 event, info->portid, seq, 0, 0, 0);
2674 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2675 WARN_ON(err == -EMSGSIZE);
2679 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2680 info->nlh, gfp_any());
2684 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify() - netdevice notifier: when the per-netns
 * loopback device registers, bind the special null (and, with
 * multiple tables, prohibit/blackhole) template routes to it so they
 * have a valid device and inet6_dev before first use.
 */
2687 static int ip6_route_dev_notify(struct notifier_block *this,
2688 unsigned long event, void *ptr)
2690 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
2691 struct net *net = dev_net(dev);
2693 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2694 net->ipv6.ip6_null_entry->dst.dev = dev;
2695 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2696 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2697 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2698 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2699 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2700 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2711 #ifdef CONFIG_PROC_FS
/*
 * rt6_info_route() - fib6 walker callback printing one route in the
 * legacy /proc/net/ipv6_route format: dst/plen, src/plen (or zeros
 * without subtrees), gateway (or zeros for non-gateway routes),
 * metric, refcnt, use count, flags and device name.
 */
2722 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2724 struct seq_file *m = p_arg;
2726 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2728 #ifdef CONFIG_IPV6_SUBTREES
2729 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2731 seq_puts(m, "00000000000000000000000000000000 00 ");
2733 if (rt->rt6i_flags & RTF_GATEWAY) {
2734 seq_printf(m, "%pi6", &rt->rt6i_gateway);
2736 seq_puts(m, "00000000000000000000000000000000");
2738 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2739 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2740 rt->dst.__use, rt->rt6i_flags,
2741 rt->dst.dev ? rt->dst.dev->name : "");
/* seq_file show handler: walk all tables read-only and print each route. */
2745 static int ipv6_route_show(struct seq_file *m, void *v)
2747 struct net *net = (struct net *)m->private;
2748 fib6_clean_all_ro(net, rt6_info_route, 0, m);
2752 static int ipv6_route_open(struct inode *inode, struct file *file)
2754 return single_open_net(inode, file, ipv6_route_show);
/* file_operations for /proc/net/ipv6_route (netns-aware single_open). */
2757 static const struct file_operations ipv6_route_proc_fops = {
2758 .owner = THIS_MODULE,
2759 .open = ipv6_route_open,
2761 .llseek = seq_lseek,
2762 .release = single_release_net,
/*
 * rt6_stats_seq_show() - emit /proc/net/rt6_stats: fib node counts,
 * allocated/total/cached route entries, live dst entries and the
 * discarded-route counter, as seven hex fields on one line.
 */
2765 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2767 struct net *net = (struct net *)seq->private;
2768 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2769 net->ipv6.rt6_stats->fib_nodes,
2770 net->ipv6.rt6_stats->fib_route_nodes,
2771 net->ipv6.rt6_stats->fib_rt_alloc,
2772 net->ipv6.rt6_stats->fib_rt_entries,
2773 net->ipv6.rt6_stats->fib_rt_cache,
2774 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2775 net->ipv6.rt6_stats->fib_discarded_routes);
2780 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2782 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats. */
2785 static const struct file_operations rt6_stats_seq_fops = {
2786 .owner = THIS_MODULE,
2787 .open = rt6_stats_seq_open,
2789 .llseek = seq_lseek,
2790 .release = single_release_net,
2792 #endif /* CONFIG_PROC_FS */
2794 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush() - handler for net.ipv6.route.flush.
 * Reads the per-netns flush_delay (netns recovered from ctl->extra1,
 * set up in ipv6_route_sysctl_init()), lets proc_dointvec store the
 * written value, and triggers fib6 garbage collection: delay <= 0
 * means flush everything now (~0UL), otherwise GC after @delay.
 * NOTE(review): the write-only guard and return are elided here.
 */
2797 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
2798 void __user *buffer, size_t *lenp, loff_t *ppos)
2805 net = (struct net *)ctl->extra1;
2806 delay = net->ipv6.sysctl.flush_delay;
2807 proc_dointvec(ctl, write, buffer, lenp, ppos);
2808 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the net.ipv6.route.* sysctl table. The .data pointers
 * reference init_net here; ipv6_route_sysctl_init() kmemdup()s this
 * table per netns and repoints each entry at the namespace's own
 * fields, so entry ORDER is part of the contract with that function.
 */
2812 struct ctl_table ipv6_route_table_template[] = {
2814 .procname = "flush",
2815 .data = &init_net.ipv6.sysctl.flush_delay,
2816 .maxlen = sizeof(int),
2818 .proc_handler = ipv6_sysctl_rtcache_flush
2821 .procname = "gc_thresh",
2822 .data = &ip6_dst_ops_template.gc_thresh,
2823 .maxlen = sizeof(int),
2825 .proc_handler = proc_dointvec,
2828 .procname = "max_size",
2829 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2830 .maxlen = sizeof(int),
2832 .proc_handler = proc_dointvec,
2835 .procname = "gc_min_interval",
2836 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2837 .maxlen = sizeof(int),
2839 .proc_handler = proc_dointvec_jiffies,
2842 .procname = "gc_timeout",
2843 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2844 .maxlen = sizeof(int),
2846 .proc_handler = proc_dointvec_jiffies,
2849 .procname = "gc_interval",
2850 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2851 .maxlen = sizeof(int),
2853 .proc_handler = proc_dointvec_jiffies,
2856 .procname = "gc_elasticity",
2857 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2858 .maxlen = sizeof(int),
2860 .proc_handler = proc_dointvec,
2863 .procname = "mtu_expires",
2864 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2865 .maxlen = sizeof(int),
2867 .proc_handler = proc_dointvec_jiffies,
2870 .procname = "min_adv_mss",
2871 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2872 .maxlen = sizeof(int),
2874 .proc_handler = proc_dointvec,
2877 .procname = "gc_min_interval_ms",
2878 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2879 .maxlen = sizeof(int),
2881 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init() - clone the sysctl template for @net and
 * repoint every .data at the namespace's own storage. Index-based
 * patching [0]..[9] must track the template's entry order exactly.
 * The netns itself rides in extra1 for the flush handler. For
 * non-init user namespaces the "flush" entry is hidden by clearing
 * its procname (the table is NULL-procname terminated).
 */
2886 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2888 struct ctl_table *table;
2890 table = kmemdup(ipv6_route_table_template,
2891 sizeof(ipv6_route_table_template),
2895 table[0].data = &net->ipv6.sysctl.flush_delay;
2896 table[0].extra1 = net;
2897 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2898 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2899 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2900 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2901 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2902 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2903 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2904 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2905 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2907 /* Don't export sysctls to unprivileged users */
2908 if (net->user_ns != &init_user_ns)
2909 table[0].procname = NULL;
/*
 * ip6_route_net_init() - per-netns routing state constructor.
 * Copies the dst_ops template, initializes the dst entry counter, and
 * kmemdup()s the special null / prohibit / blackhole template routes
 * (each self-pathed, bound to this netns's dst_ops, with read-only
 * template metrics), then seeds the sysctl defaults and the GC expiry.
 * Error unwinding flows through the reverse-order labels at the end.
 * NOTE(review): a few lines (allocation GFP flags, the success
 * return, the out_ip6_dst_ops label) are elided from this extract.
 */
2916 static int __net_init ip6_route_net_init(struct net *net)
2920 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2921 sizeof(net->ipv6.ip6_dst_ops));
2923 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2924 goto out_ip6_dst_ops;
2926 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2927 sizeof(*net->ipv6.ip6_null_entry),
2929 if (!net->ipv6.ip6_null_entry)
2930 goto out_ip6_dst_entries;
2931 net->ipv6.ip6_null_entry->dst.path =
2932 (struct dst_entry *)net->ipv6.ip6_null_entry;
2933 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2934 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2935 ip6_template_metrics, true);
2937 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2938 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2939 sizeof(*net->ipv6.ip6_prohibit_entry),
2941 if (!net->ipv6.ip6_prohibit_entry)
2942 goto out_ip6_null_entry;
2943 net->ipv6.ip6_prohibit_entry->dst.path =
2944 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2945 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2946 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2947 ip6_template_metrics, true);
2949 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2950 sizeof(*net->ipv6.ip6_blk_hole_entry),
2952 if (!net->ipv6.ip6_blk_hole_entry)
2953 goto out_ip6_prohibit_entry;
2954 net->ipv6.ip6_blk_hole_entry->dst.path =
2955 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2956 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2957 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2958 ip6_template_metrics, true);
/* Per-netns sysctl defaults; see ipv6_route_table_template above. */
2961 net->ipv6.sysctl.flush_delay = 0;
2962 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2963 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2964 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2965 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2966 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2967 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2968 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2970 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwinding: free in reverse order of allocation. */
2976 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2977 out_ip6_prohibit_entry:
2978 kfree(net->ipv6.ip6_prohibit_entry);
2980 kfree(net->ipv6.ip6_null_entry);
2982 out_ip6_dst_entries:
2983 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2988 static void __net_exit ip6_route_net_exit(struct net *net)
/*
 * Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the per-netns dst entry counters.
 * Mirrors the init function's allocations in the same config layout.
 */
2990 kfree(net->ipv6.ip6_null_entry);
2991 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2992 kfree(net->ipv6.ip6_prohibit_entry);
2993 kfree(net->ipv6.ip6_blk_hole_entry);
2995 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2998 static int __net_init ip6_route_net_init_late(struct net *net)
/*
 * Late per-netns init, run after the routing tables are usable:
 * expose /proc/net/ipv6_route and /proc/net/rt6_stats.
 */
3000 #ifdef CONFIG_PROC_FS
3001 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3002 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3007 static void __net_exit ip6_route_net_exit_late(struct net *net)
/* Remove the procfs entries created by ip6_route_net_init_late(). */
3009 #ifdef CONFIG_PROC_FS
3010 remove_proc_entry("ipv6_route", net->proc_net);
3011 remove_proc_entry("rt6_stats", net->proc_net);
/* Core routing state: one init/exit pair per network namespace. */
3015 static struct pernet_operations ip6_route_net_ops = {
3016 .init = ip6_route_net_init,
3017 .exit = ip6_route_net_exit,
3020 static int __net_init ipv6_inetpeer_init(struct net *net)
/*
 * Allocate and initialise the per-netns inet_peer_base used by the
 * IPv6 routing code (the -ENOMEM check on bp is elided in this view).
 */
3022 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3026 inet_peer_base_init(bp);
3027 net->ipv6.peers = bp;
3031 static void __net_exit ipv6_inetpeer_exit(struct net *net)
/*
 * Detach the per-netns inetpeer base, then invalidate its tree so
 * remaining peer entries are released.
 */
3033 struct inet_peer_base *bp = net->ipv6.peers;
/* Clear the pointer before teardown so no new lookups use it. */
3035 net->ipv6.peers = NULL;
3036 inetpeer_invalidate_tree(bp);
/* Per-netns lifecycle of the IPv6 inetpeer storage. */
3040 static struct pernet_operations ipv6_inetpeer_ops = {
3041 .init = ipv6_inetpeer_init,
3042 .exit = ipv6_inetpeer_exit,
/* Late pernet hooks: procfs entries, registered after FIB setup. */
3045 static struct pernet_operations ip6_route_net_late_ops = {
3046 .init = ip6_route_net_init_late,
3047 .exit = ip6_route_net_exit_late,
/* Netdevice event notifier for the IPv6 routing layer. */
3050 static struct notifier_block ip6_route_dev_notifier = {
3051 .notifier_call = ip6_route_dev_notify,
3055 int __init ip6_route_init(void)
/*
 * Boot-time initialisation of the IPv6 routing subsystem.  Order
 * matters: slab cache -> blackhole dst counters -> inetpeer pernet
 * ops -> route pernet ops -> loopback fixup for init_net -> fib6
 * rules -> late pernet ops (procfs) -> rtnetlink handlers ->
 * netdevice notifier.  Failures unwind in reverse order through the
 * goto labels at the bottom (some labels/returns elided in this view).
 */
/* Slab cache for rt6_info objects, shared by all namespaces. */
3060 ip6_dst_ops_template.kmem_cachep =
3061 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3062 SLAB_HWCACHE_ALIGN, NULL);
3063 if (!ip6_dst_ops_template.kmem_cachep)
3066 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3068 goto out_kmem_cache;
3070 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3072 goto out_dst_entries;
3074 ret = register_pernet_subsys(&ip6_route_net_ops);
3076 goto out_register_inetpeer;
/* Blackhole dsts come from the same slab as regular rt6_info. */
3078 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3080 /* Registering of the loopback is done before this portion of code,
3081 * the loopback reference in rt6_info will not be taken, do it
3082 * manually for init_net */
3083 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3084 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3085 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3086 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3087 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3088 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3089 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3093 goto out_register_subsys;
3099 ret = fib6_rules_init();
3103 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3105 goto fib6_rules_init;
/* Netlink route message handlers (RTM_NEW/DEL/GETROUTE). */
3108 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3109 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3110 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3111 goto out_register_late_subsys;
3113 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3115 goto out_register_late_subsys;
/* Error unwinding: undo each registration in reverse order. */
3120 out_register_late_subsys:
3121 unregister_pernet_subsys(&ip6_route_net_late_ops);
3123 fib6_rules_cleanup();
3128 out_register_subsys:
3129 unregister_pernet_subsys(&ip6_route_net_ops);
3130 out_register_inetpeer:
3131 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3133 dst_entries_destroy(&ip6_dst_blackhole_ops);
3135 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3139 void ip6_route_cleanup(void)
/*
 * Module teardown: exact reverse of ip6_route_init() — notifier,
 * late pernet ops, fib6 rules, pernet subsystems, dst counters and
 * finally the rt6_info slab cache.
 */
3141 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3142 unregister_pernet_subsys(&ip6_route_net_late_ops);
3143 fib6_rules_cleanup();
3146 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3147 unregister_pernet_subsys(&ip6_route_net_ops);
3148 dst_entries_destroy(&ip6_dst_blackhole_ops);
3149 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);