]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/ipv6/route.c
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
[karo-tx-linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

/*
 * Debug helpers.  With RT6_DEBUG >= 3, RDBG() and RT6_TRACE() expand to
 * printk() calls (RT6_TRACE at KERN_DEBUG level); otherwise RDBG() expands
 * to nothing and RT6_TRACE() to an empty statement.
 */
#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
74
/*
 * Forward declarations of file-local functions that are referenced (by the
 * dst_ops tables, the entry templates and the lookup code) before the point
 * where they are defined.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Helpers used by rt6_route_rcv() to look up and to create the route that
 * corresponds to a received route information option.
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) {
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
/*
 * dst_ops table for regular IPv6 routes: address family, ethertype, the
 * garbage-collection hook with its threshold, and the route callbacks.
 * NOTE(review): the "_template" suffix and the use of
 * net->ipv6.ip6_dst_ops elsewhere in this file suggest it is copied per
 * network namespace -- confirm at the copy site.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .default_mtu            =       ip6_default_mtu,
        .cow_metrics            =       ipv6_cow_metrics,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .local_out              =       __ip6_local_out,
};
146
/* default_mtu handler for blackhole dsts: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
        return 0;
}
151
/* update_pmtu handler for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
155
/*
 * cow_metrics handler for blackhole dsts: never provides a writable
 * metrics array, always returns NULL.
 */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                         unsigned long old)
{
        return NULL;
}
161
/*
 * dst_ops table for blackhole dsts created by ip6_blackhole_route().
 * It shares destroy/check/default_advmss with the regular IPv6 ops and
 * uses the stub handlers above for default_mtu, update_pmtu and
 * cow_metrics.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .default_mtu            =       ip6_blackhole_default_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
};
172
/*
 * Read-only metrics array with every slot zero except the RTAX_HOPLIMIT
 * slot (index RTAX_HOPLIMIT - 1), which is set to 255.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 255,
};
176
/*
 * Template for the "null" route: a reject route without next hop whose dst
 * error is -ENETUNREACH and whose input/output handlers are
 * ip6_pkt_discard() / ip6_pkt_discard_out().  Metric is the largest u32.
 * NOTE(review): presumably the source of net->ipv6.ip6_null_entry, which
 * the lookup code in this file returns when nothing matches -- confirm
 * where the template is instantiated.
 */
static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
191
192 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
193
/* Input/output handlers installed by ip6_prohibit_entry_template below. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
196
/*
 * Template for the "prohibit" route (only with CONFIG_IPV6_MULTIPLE_TABLES):
 * a reject route without next hop whose dst error is -EACCES and whose
 * input/output handlers are ip6_pkt_prohibit() / ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
211
/*
 * Template for the "blackhole" route (only with
 * CONFIG_IPV6_MULTIPLE_TABLES): a reject route without next hop whose dst
 * error is -EINVAL and whose input and output handlers are both
 * dst_discard().
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
226
227 #endif
228
229 /* allocate dst with ip6_dst_ops */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231                                              struct net_device *dev)
232 {
233         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, 0);
234
235         memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
236
237         return rt;
238 }
239
240 static void ip6_dst_destroy(struct dst_entry *dst)
241 {
242         struct rt6_info *rt = (struct rt6_info *)dst;
243         struct inet6_dev *idev = rt->rt6i_idev;
244         struct inet_peer *peer = rt->rt6i_peer;
245
246         if (idev != NULL) {
247                 rt->rt6i_idev = NULL;
248                 in6_dev_put(idev);
249         }
250         if (peer) {
251                 rt->rt6i_peer = NULL;
252                 inet_putpeer(peer);
253         }
254 }
255
/*
 * Generation counter compared against rt->rt6i_peer_genid; read through
 * rt6_peer_genid().
 */
static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
257
258 static u32 rt6_peer_genid(void)
259 {
260         return atomic_read(&__rt6_peer_genid);
261 }
262
263 void rt6_bind_peer(struct rt6_info *rt, int create)
264 {
265         struct inet_peer *peer;
266
267         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
268         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
269                 inet_putpeer(peer);
270         else
271                 rt->rt6i_peer_genid = rt6_peer_genid();
272 }
273
274 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
275                            int how)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct net_device *loopback_dev =
280                 dev_net(dev)->loopback_dev;
281
282         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
283                 struct inet6_dev *loopback_idev =
284                         in6_dev_get(loopback_dev);
285                 if (loopback_idev != NULL) {
286                         rt->rt6i_idev = loopback_idev;
287                         in6_dev_put(idev);
288                 }
289         }
290 }
291
292 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
293 {
294         return (rt->rt6i_flags & RTF_EXPIRES) &&
295                 time_after(jiffies, rt->rt6i_expires);
296 }
297
298 static inline int rt6_need_strict(const struct in6_addr *daddr)
299 {
300         return ipv6_addr_type(daddr) &
301                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
302 }
303
304 /*
305  *      Route lookup. Any table->tb6_lock is implied.
306  */
307
308 static inline struct rt6_info *rt6_device_match(struct net *net,
309                                                     struct rt6_info *rt,
310                                                     const struct in6_addr *saddr,
311                                                     int oif,
312                                                     int flags)
313 {
314         struct rt6_info *local = NULL;
315         struct rt6_info *sprt;
316
317         if (!oif && ipv6_addr_any(saddr))
318                 goto out;
319
320         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
321                 struct net_device *dev = sprt->rt6i_dev;
322
323                 if (oif) {
324                         if (dev->ifindex == oif)
325                                 return sprt;
326                         if (dev->flags & IFF_LOOPBACK) {
327                                 if (sprt->rt6i_idev == NULL ||
328                                     sprt->rt6i_idev->dev->ifindex != oif) {
329                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
330                                                 continue;
331                                         if (local && (!oif ||
332                                                       local->rt6i_idev->dev->ifindex == oif))
333                                                 continue;
334                                 }
335                                 local = sprt;
336                         }
337                 } else {
338                         if (ipv6_chk_addr(net, saddr, dev,
339                                           flags & RT6_LOOKUP_F_IFACE))
340                                 return sprt;
341                 }
342         }
343
344         if (oif) {
345                 if (local)
346                         return local;
347
348                 if (flags & RT6_LOOKUP_F_IFACE)
349                         return net->ipv6.ip6_null_entry;
350         }
351 out:
352         return rt;
353 }
354
355 #ifdef CONFIG_IPV6_ROUTER_PREF
356 static void rt6_probe(struct rt6_info *rt)
357 {
358         struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
359         /*
360          * Okay, this does not seem to be appropriate
361          * for now, however, we need to check if it
362          * is really so; aka Router Reachability Probing.
363          *
364          * Router Reachability Probe MUST be rate-limited
365          * to no more than one per minute.
366          */
367         if (!neigh || (neigh->nud_state & NUD_VALID))
368                 return;
369         read_lock_bh(&neigh->lock);
370         if (!(neigh->nud_state & NUD_VALID) &&
371             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
372                 struct in6_addr mcaddr;
373                 struct in6_addr *target;
374
375                 neigh->updated = jiffies;
376                 read_unlock_bh(&neigh->lock);
377
378                 target = (struct in6_addr *)&neigh->primary_key;
379                 addrconf_addr_solict_mult(target, &mcaddr);
380                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
381         } else
382                 read_unlock_bh(&neigh->lock);
383 }
384 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
388 #endif
389
390 /*
391  * Default Router Selection (RFC 2461 6.3.6)
392  */
393 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
394 {
395         struct net_device *dev = rt->rt6i_dev;
396         if (!oif || dev->ifindex == oif)
397                 return 2;
398         if ((dev->flags & IFF_LOOPBACK) &&
399             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
400                 return 1;
401         return 0;
402 }
403
404 static inline int rt6_check_neigh(struct rt6_info *rt)
405 {
406         struct neighbour *neigh = rt->rt6i_nexthop;
407         int m;
408         if (rt->rt6i_flags & RTF_NONEXTHOP ||
409             !(rt->rt6i_flags & RTF_GATEWAY))
410                 m = 1;
411         else if (neigh) {
412                 read_lock_bh(&neigh->lock);
413                 if (neigh->nud_state & NUD_VALID)
414                         m = 2;
415 #ifdef CONFIG_IPV6_ROUTER_PREF
416                 else if (neigh->nud_state & NUD_FAILED)
417                         m = 0;
418 #endif
419                 else
420                         m = 1;
421                 read_unlock_bh(&neigh->lock);
422         } else
423                 m = 0;
424         return m;
425 }
426
427 static int rt6_score_route(struct rt6_info *rt, int oif,
428                            int strict)
429 {
430         int m, n;
431
432         m = rt6_check_dev(rt, oif);
433         if (!m && (strict & RT6_LOOKUP_F_IFACE))
434                 return -1;
435 #ifdef CONFIG_IPV6_ROUTER_PREF
436         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
437 #endif
438         n = rt6_check_neigh(rt);
439         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
440                 return -1;
441         return m;
442 }
443
444 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
445                                    int *mpri, struct rt6_info *match)
446 {
447         int m;
448
449         if (rt6_check_expired(rt))
450                 goto out;
451
452         m = rt6_score_route(rt, oif, strict);
453         if (m < 0)
454                 goto out;
455
456         if (m > *mpri) {
457                 if (strict & RT6_LOOKUP_F_REACHABLE)
458                         rt6_probe(match);
459                 *mpri = m;
460                 match = rt;
461         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
462                 rt6_probe(rt);
463         }
464
465 out:
466         return match;
467 }
468
469 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
470                                      struct rt6_info *rr_head,
471                                      u32 metric, int oif, int strict)
472 {
473         struct rt6_info *rt, *match;
474         int mpri = -1;
475
476         match = NULL;
477         for (rt = rr_head; rt && rt->rt6i_metric == metric;
478              rt = rt->dst.rt6_next)
479                 match = find_match(rt, oif, strict, &mpri, match);
480         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
481              rt = rt->dst.rt6_next)
482                 match = find_match(rt, oif, strict, &mpri, match);
483
484         return match;
485 }
486
487 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
488 {
489         struct rt6_info *match, *rt0;
490         struct net *net;
491
492         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
493                   __func__, fn->leaf, oif);
494
495         rt0 = fn->rr_ptr;
496         if (!rt0)
497                 fn->rr_ptr = rt0 = fn->leaf;
498
499         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
500
501         if (!match &&
502             (strict & RT6_LOOKUP_F_REACHABLE)) {
503                 struct rt6_info *next = rt0->dst.rt6_next;
504
505                 /* no entries matched; do round-robin */
506                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
507                         next = fn->leaf;
508
509                 if (next != rt0)
510                         fn->rr_ptr = next;
511         }
512
513         RT6_TRACE("%s() => %p\n",
514                   __func__, match);
515
516         net = dev_net(rt0->rt6i_dev);
517         return match ? match : net->ipv6.ip6_null_entry;
518 }
519
520 #ifdef CONFIG_IPV6_ROUTE_INFO
521 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
522                   const struct in6_addr *gwaddr)
523 {
524         struct net *net = dev_net(dev);
525         struct route_info *rinfo = (struct route_info *) opt;
526         struct in6_addr prefix_buf, *prefix;
527         unsigned int pref;
528         unsigned long lifetime;
529         struct rt6_info *rt;
530
531         if (len < sizeof(struct route_info)) {
532                 return -EINVAL;
533         }
534
535         /* Sanity check for prefix_len and length */
536         if (rinfo->length > 3) {
537                 return -EINVAL;
538         } else if (rinfo->prefix_len > 128) {
539                 return -EINVAL;
540         } else if (rinfo->prefix_len > 64) {
541                 if (rinfo->length < 2) {
542                         return -EINVAL;
543                 }
544         } else if (rinfo->prefix_len > 0) {
545                 if (rinfo->length < 1) {
546                         return -EINVAL;
547                 }
548         }
549
550         pref = rinfo->route_pref;
551         if (pref == ICMPV6_ROUTER_PREF_INVALID)
552                 return -EINVAL;
553
554         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
555
556         if (rinfo->length == 3)
557                 prefix = (struct in6_addr *)rinfo->prefix;
558         else {
559                 /* this function is safe */
560                 ipv6_addr_prefix(&prefix_buf,
561                                  (struct in6_addr *)rinfo->prefix,
562                                  rinfo->prefix_len);
563                 prefix = &prefix_buf;
564         }
565
566         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
567                                 dev->ifindex);
568
569         if (rt && !lifetime) {
570                 ip6_del_rt(rt);
571                 rt = NULL;
572         }
573
574         if (!rt && lifetime)
575                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
576                                         pref);
577         else if (rt)
578                 rt->rt6i_flags = RTF_ROUTEINFO |
579                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
580
581         if (rt) {
582                 if (!addrconf_finite_timeout(lifetime)) {
583                         rt->rt6i_flags &= ~RTF_EXPIRES;
584                 } else {
585                         rt->rt6i_expires = jiffies + HZ * lifetime;
586                         rt->rt6i_flags |= RTF_EXPIRES;
587                 }
588                 dst_release(&rt->dst);
589         }
590         return 0;
591 }
592 #endif
593
/*
 * BACKTRACK(net, saddr) - retry a failed lookup higher up in the FIB tree.
 *
 * This macro is not self-contained: it reads and modifies the caller's
 * local variables `rt` and `fn`, and jumps to the caller's labels `out`
 * and `restart`.
 *
 * It only acts when `rt` is the namespace's null entry.  It then walks
 * upwards from `fn`:
 *   - if the current node is a top-level root (RTN_TL_ROOT), give up and
 *     jump to `out`;
 *   - otherwise move to the parent, or -- when the parent has a subtree
 *     that is not the node we came from -- look @saddr up in that subtree;
 *   - as soon as the resulting node carries route info (RTN_RTINFO), jump
 *     to `restart` so the caller repeats route selection on it.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
611
612 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
613                                              struct fib6_table *table,
614                                              struct flowi6 *fl6, int flags)
615 {
616         struct fib6_node *fn;
617         struct rt6_info *rt;
618
619         read_lock_bh(&table->tb6_lock);
620         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
621 restart:
622         rt = fn->leaf;
623         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
624         BACKTRACK(net, &fl6->saddr);
625 out:
626         dst_use(&rt->dst, jiffies);
627         read_unlock_bh(&table->tb6_lock);
628         return rt;
629
630 }
631
632 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
633                             const struct in6_addr *saddr, int oif, int strict)
634 {
635         struct flowi6 fl6 = {
636                 .flowi6_oif = oif,
637                 .daddr = *daddr,
638         };
639         struct dst_entry *dst;
640         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
641
642         if (saddr) {
643                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
644                 flags |= RT6_LOOKUP_F_HAS_SADDR;
645         }
646
647         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
648         if (dst->error == 0)
649                 return (struct rt6_info *) dst;
650
651         dst_release(dst);
652
653         return NULL;
654 }
655
656 EXPORT_SYMBOL(rt6_lookup);
657
658 /* ip6_ins_rt is called with FREE table->tb6_lock.
659    It takes new route entry, the addition fails by any reason the
660    route is freed. In any case, if caller does not hold it, it may
661    be destroyed.
662  */
663
664 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
665 {
666         int err;
667         struct fib6_table *table;
668
669         table = rt->rt6i_table;
670         write_lock_bh(&table->tb6_lock);
671         err = fib6_add(&table->tb6_root, rt, info);
672         write_unlock_bh(&table->tb6_lock);
673
674         return err;
675 }
676
677 int ip6_ins_rt(struct rt6_info *rt)
678 {
679         struct nl_info info = {
680                 .nl_net = dev_net(rt->rt6i_dev),
681         };
682         return __ip6_ins_rt(rt, &info);
683 }
684
685 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
686                                       const struct in6_addr *saddr)
687 {
688         struct rt6_info *rt;
689
690         /*
691          *      Clone the route.
692          */
693
694         rt = ip6_rt_copy(ort);
695
696         if (rt) {
697                 struct neighbour *neigh;
698                 int attempts = !in_softirq();
699
700                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
701                         if (rt->rt6i_dst.plen != 128 &&
702                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
703                                 rt->rt6i_flags |= RTF_ANYCAST;
704                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
705                 }
706
707                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
708                 rt->rt6i_dst.plen = 128;
709                 rt->rt6i_flags |= RTF_CACHE;
710                 rt->dst.flags |= DST_HOST;
711
712 #ifdef CONFIG_IPV6_SUBTREES
713                 if (rt->rt6i_src.plen && saddr) {
714                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
715                         rt->rt6i_src.plen = 128;
716                 }
717 #endif
718
719         retry:
720                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
721                 if (IS_ERR(neigh)) {
722                         struct net *net = dev_net(rt->rt6i_dev);
723                         int saved_rt_min_interval =
724                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
725                         int saved_rt_elasticity =
726                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
727
728                         if (attempts-- > 0) {
729                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
730                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
731
732                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
733
734                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
735                                         saved_rt_elasticity;
736                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
737                                         saved_rt_min_interval;
738                                 goto retry;
739                         }
740
741                         if (net_ratelimit())
742                                 printk(KERN_WARNING
743                                        "ipv6: Neighbour table overflow.\n");
744                         dst_free(&rt->dst);
745                         return NULL;
746                 }
747                 rt->rt6i_nexthop = neigh;
748
749         }
750
751         return rt;
752 }
753
754 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
755 {
756         struct rt6_info *rt = ip6_rt_copy(ort);
757         if (rt) {
758                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
759                 rt->rt6i_dst.plen = 128;
760                 rt->rt6i_flags |= RTF_CACHE;
761                 rt->dst.flags |= DST_HOST;
762                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
763         }
764         return rt;
765 }
766
/*
 * Route lookup shared by the input and output paths (see
 * ip6_pol_route_input() and ip6_pol_route_output()).
 *
 * Looks @fl6 up in @table, selects a route for interface @oif with
 * rt6_select() and, unless the result is the null entry, already a cache
 * entry (RTF_CACHE) or already a host route, creates a per-destination
 * cache route from it and inserts that into the table.
 *
 * The returned route always has a reference taken with dst_hold() and its
 * lastuse/__use fields updated.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        /* Upper bound on relookups after a failed insert. */
        int attempts = 3;
        int err;
        /* Require reachable routers first, unless forwarding is enabled. */
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
        rt = rt6_select(fn, oif, strict | reachable);

        /* May jump to `restart` (parent node) or to `out` (root reached). */
        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Pin the route, then drop the lock while the clone is built. */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        /*
         * No neighbour bound yet (and one is needed): build the clone with
         * its own neighbour lookup.  Otherwise, for a non-host route, clone
         * it sharing the neighbour.  A host route is returned as is.
         */
        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                goto out2;

        dst_release(&rt->dst);
        /* GNU "?:" extension: use nrt if non-NULL, else the null entry. */
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /*
         * If the REACHABLE requirement was still in force, clear it and
         * repeat the lookup once without it (the lock is still held).
         */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
836
837 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
838                                             struct flowi6 *fl6, int flags)
839 {
840         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
841 }
842
843 void ip6_route_input(struct sk_buff *skb)
844 {
845         const struct ipv6hdr *iph = ipv6_hdr(skb);
846         struct net *net = dev_net(skb->dev);
847         int flags = RT6_LOOKUP_F_HAS_SADDR;
848         struct flowi6 fl6 = {
849                 .flowi6_iif = skb->dev->ifindex,
850                 .daddr = iph->daddr,
851                 .saddr = iph->saddr,
852                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
853                 .flowi6_mark = skb->mark,
854                 .flowi6_proto = iph->nexthdr,
855         };
856
857         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
858                 flags |= RT6_LOOKUP_F_IFACE;
859
860         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
861 }
862
863 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
864                                              struct flowi6 *fl6, int flags)
865 {
866         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
867 }
868
869 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
870                                     struct flowi6 *fl6)
871 {
872         int flags = 0;
873
874         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
875                 flags |= RT6_LOOKUP_F_IFACE;
876
877         if (!ipv6_addr_any(&fl6->saddr))
878                 flags |= RT6_LOOKUP_F_HAS_SADDR;
879         else if (sk)
880                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
881
882         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
883 }
884
885 EXPORT_SYMBOL(ip6_route_output);
886
887 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
888 {
889         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
890         struct dst_entry *new = NULL;
891
892         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
893         if (rt) {
894                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
895
896                 new = &rt->dst;
897
898                 new->__use = 1;
899                 new->input = dst_discard;
900                 new->output = dst_discard;
901
902                 dst_copy_metrics(new, &ort->dst);
903                 rt->rt6i_idev = ort->rt6i_idev;
904                 if (rt->rt6i_idev)
905                         in6_dev_hold(rt->rt6i_idev);
906                 rt->rt6i_expires = 0;
907
908                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
909                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
910                 rt->rt6i_metric = 0;
911
912                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
913 #ifdef CONFIG_IPV6_SUBTREES
914                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
915 #endif
916
917                 dst_free(new);
918         }
919
920         dst_release(dst_orig);
921         return new ? new : ERR_PTR(-ENOMEM);
922 }
923
924 /*
925  *      Destination cache support functions
926  */
927
928 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
929 {
930         struct rt6_info *rt;
931
932         rt = (struct rt6_info *) dst;
933
934         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
935                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
936                         if (!rt->rt6i_peer)
937                                 rt6_bind_peer(rt, 0);
938                         rt->rt6i_peer_genid = rt6_peer_genid();
939                 }
940                 return dst;
941         }
942         return NULL;
943 }
944
945 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
946 {
947         struct rt6_info *rt = (struct rt6_info *) dst;
948
949         if (rt) {
950                 if (rt->rt6i_flags & RTF_CACHE) {
951                         if (rt6_check_expired(rt)) {
952                                 ip6_del_rt(rt);
953                                 dst = NULL;
954                         }
955                 } else {
956                         dst_release(dst);
957                         dst = NULL;
958                 }
959         }
960         return dst;
961 }
962
963 static void ip6_link_failure(struct sk_buff *skb)
964 {
965         struct rt6_info *rt;
966
967         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
968
969         rt = (struct rt6_info *) skb_dst(skb);
970         if (rt) {
971                 if (rt->rt6i_flags&RTF_CACHE) {
972                         dst_set_expires(&rt->dst, 0);
973                         rt->rt6i_flags |= RTF_EXPIRES;
974                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
975                         rt->rt6i_node->fn_sernum = -1;
976         }
977 }
978
979 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
980 {
981         struct rt6_info *rt6 = (struct rt6_info*)dst;
982
983         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
984                 rt6->rt6i_flags |= RTF_MODIFIED;
985                 if (mtu < IPV6_MIN_MTU) {
986                         u32 features = dst_metric(dst, RTAX_FEATURES);
987                         mtu = IPV6_MIN_MTU;
988                         features |= RTAX_FEATURE_ALLFRAG;
989                         dst_metric_set(dst, RTAX_FEATURES, features);
990                 }
991                 dst_metric_set(dst, RTAX_MTU, mtu);
992         }
993 }
994
995 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
996 {
997         struct net_device *dev = dst->dev;
998         unsigned int mtu = dst_mtu(dst);
999         struct net *net = dev_net(dev);
1000
1001         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1002
1003         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1004                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1005
1006         /*
1007          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1008          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1009          * IPV6_MAXPLEN is also valid and means: "any MSS,
1010          * rely only on pmtu discovery"
1011          */
1012         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1013                 mtu = IPV6_MAXPLEN;
1014         return mtu;
1015 }
1016
1017 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1018 {
1019         unsigned int mtu = IPV6_MIN_MTU;
1020         struct inet6_dev *idev;
1021
1022         rcu_read_lock();
1023         idev = __in6_dev_get(dst->dev);
1024         if (idev)
1025                 mtu = idev->cnf.mtu6;
1026         rcu_read_unlock();
1027
1028         return mtu;
1029 }
1030
1031 static struct dst_entry *icmp6_dst_gc_list;
1032 static DEFINE_SPINLOCK(icmp6_dst_lock);
1033
1034 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1035                                   struct neighbour *neigh,
1036                                   const struct in6_addr *addr)
1037 {
1038         struct rt6_info *rt;
1039         struct inet6_dev *idev = in6_dev_get(dev);
1040         struct net *net = dev_net(dev);
1041
1042         if (unlikely(idev == NULL))
1043                 return NULL;
1044
1045         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev);
1046         if (unlikely(rt == NULL)) {
1047                 in6_dev_put(idev);
1048                 goto out;
1049         }
1050
1051         if (neigh)
1052                 neigh_hold(neigh);
1053         else {
1054                 neigh = ndisc_get_neigh(dev, addr);
1055                 if (IS_ERR(neigh))
1056                         neigh = NULL;
1057         }
1058
1059         rt->rt6i_idev     = idev;
1060         rt->rt6i_nexthop  = neigh;
1061         atomic_set(&rt->dst.__refcnt, 1);
1062         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1063         rt->dst.output  = ip6_output;
1064
1065 #if 0   /* there's no chance to use these for ndisc */
1066         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1067                                 ? DST_HOST
1068                                 : 0;
1069         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1070         rt->rt6i_dst.plen = 128;
1071 #endif
1072
1073         spin_lock_bh(&icmp6_dst_lock);
1074         rt->dst.next = icmp6_dst_gc_list;
1075         icmp6_dst_gc_list = &rt->dst;
1076         spin_unlock_bh(&icmp6_dst_lock);
1077
1078         fib6_force_start_gc(net);
1079
1080 out:
1081         return &rt->dst;
1082 }
1083
1084 int icmp6_dst_gc(void)
1085 {
1086         struct dst_entry *dst, **pprev;
1087         int more = 0;
1088
1089         spin_lock_bh(&icmp6_dst_lock);
1090         pprev = &icmp6_dst_gc_list;
1091
1092         while ((dst = *pprev) != NULL) {
1093                 if (!atomic_read(&dst->__refcnt)) {
1094                         *pprev = dst->next;
1095                         dst_free(dst);
1096                 } else {
1097                         pprev = &dst->next;
1098                         ++more;
1099                 }
1100         }
1101
1102         spin_unlock_bh(&icmp6_dst_lock);
1103
1104         return more;
1105 }
1106
1107 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1108                             void *arg)
1109 {
1110         struct dst_entry *dst, **pprev;
1111
1112         spin_lock_bh(&icmp6_dst_lock);
1113         pprev = &icmp6_dst_gc_list;
1114         while ((dst = *pprev) != NULL) {
1115                 struct rt6_info *rt = (struct rt6_info *) dst;
1116                 if (func(rt, arg)) {
1117                         *pprev = dst->next;
1118                         dst_free(dst);
1119                 } else {
1120                         pprev = &dst->next;
1121                 }
1122         }
1123         spin_unlock_bh(&icmp6_dst_lock);
1124 }
1125
1126 static int ip6_dst_gc(struct dst_ops *ops)
1127 {
1128         unsigned long now = jiffies;
1129         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1130         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1131         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1132         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1133         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1134         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1135         int entries;
1136
1137         entries = dst_entries_get_fast(ops);
1138         if (time_after(rt_last_gc + rt_min_interval, now) &&
1139             entries <= rt_max_size)
1140                 goto out;
1141
1142         net->ipv6.ip6_rt_gc_expire++;
1143         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1144         net->ipv6.ip6_rt_last_gc = now;
1145         entries = dst_entries_get_slow(ops);
1146         if (entries < ops->gc_thresh)
1147                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1148 out:
1149         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1150         return entries > rt_max_size;
1151 }
1152
1153 /* Clean host part of a prefix. Not necessary in radix tree,
1154    but results in cleaner routing tables.
1155
1156    Remove it only when all the things will work!
1157  */
1158
1159 int ip6_dst_hoplimit(struct dst_entry *dst)
1160 {
1161         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1162         if (hoplimit == 0) {
1163                 struct net_device *dev = dst->dev;
1164                 struct inet6_dev *idev;
1165
1166                 rcu_read_lock();
1167                 idev = __in6_dev_get(dev);
1168                 if (idev)
1169                         hoplimit = idev->cnf.hop_limit;
1170                 else
1171                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1172                 rcu_read_unlock();
1173         }
1174         return hoplimit;
1175 }
1176 EXPORT_SYMBOL(ip6_dst_hoplimit);
1177
1178 /*
1179  *
1180  */
1181
1182 int ip6_route_add(struct fib6_config *cfg)
1183 {
1184         int err;
1185         struct net *net = cfg->fc_nlinfo.nl_net;
1186         struct rt6_info *rt = NULL;
1187         struct net_device *dev = NULL;
1188         struct inet6_dev *idev = NULL;
1189         struct fib6_table *table;
1190         int addr_type;
1191
1192         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1193                 return -EINVAL;
1194 #ifndef CONFIG_IPV6_SUBTREES
1195         if (cfg->fc_src_len)
1196                 return -EINVAL;
1197 #endif
1198         if (cfg->fc_ifindex) {
1199                 err = -ENODEV;
1200                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1201                 if (!dev)
1202                         goto out;
1203                 idev = in6_dev_get(dev);
1204                 if (!idev)
1205                         goto out;
1206         }
1207
1208         if (cfg->fc_metric == 0)
1209                 cfg->fc_metric = IP6_RT_PRIO_USER;
1210
1211         table = fib6_new_table(net, cfg->fc_table);
1212         if (table == NULL) {
1213                 err = -ENOBUFS;
1214                 goto out;
1215         }
1216
1217         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL);
1218
1219         if (rt == NULL) {
1220                 err = -ENOMEM;
1221                 goto out;
1222         }
1223
1224         rt->dst.obsolete = -1;
1225         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1226                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1227                                 0;
1228
1229         if (cfg->fc_protocol == RTPROT_UNSPEC)
1230                 cfg->fc_protocol = RTPROT_BOOT;
1231         rt->rt6i_protocol = cfg->fc_protocol;
1232
1233         addr_type = ipv6_addr_type(&cfg->fc_dst);
1234
1235         if (addr_type & IPV6_ADDR_MULTICAST)
1236                 rt->dst.input = ip6_mc_input;
1237         else if (cfg->fc_flags & RTF_LOCAL)
1238                 rt->dst.input = ip6_input;
1239         else
1240                 rt->dst.input = ip6_forward;
1241
1242         rt->dst.output = ip6_output;
1243
1244         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1245         rt->rt6i_dst.plen = cfg->fc_dst_len;
1246         if (rt->rt6i_dst.plen == 128)
1247                rt->dst.flags = DST_HOST;
1248
1249 #ifdef CONFIG_IPV6_SUBTREES
1250         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1251         rt->rt6i_src.plen = cfg->fc_src_len;
1252 #endif
1253
1254         rt->rt6i_metric = cfg->fc_metric;
1255
1256         /* We cannot add true routes via loopback here,
1257            they would result in kernel looping; promote them to reject routes
1258          */
1259         if ((cfg->fc_flags & RTF_REJECT) ||
1260             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1261                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1262                 /* hold loopback dev/idev if we haven't done so. */
1263                 if (dev != net->loopback_dev) {
1264                         if (dev) {
1265                                 dev_put(dev);
1266                                 in6_dev_put(idev);
1267                         }
1268                         dev = net->loopback_dev;
1269                         dev_hold(dev);
1270                         idev = in6_dev_get(dev);
1271                         if (!idev) {
1272                                 err = -ENODEV;
1273                                 goto out;
1274                         }
1275                 }
1276                 rt->dst.output = ip6_pkt_discard_out;
1277                 rt->dst.input = ip6_pkt_discard;
1278                 rt->dst.error = -ENETUNREACH;
1279                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1280                 goto install_route;
1281         }
1282
1283         if (cfg->fc_flags & RTF_GATEWAY) {
1284                 const struct in6_addr *gw_addr;
1285                 int gwa_type;
1286
1287                 gw_addr = &cfg->fc_gateway;
1288                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1289                 gwa_type = ipv6_addr_type(gw_addr);
1290
1291                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1292                         struct rt6_info *grt;
1293
1294                         /* IPv6 strictly inhibits using not link-local
1295                            addresses as nexthop address.
1296                            Otherwise, router will not able to send redirects.
1297                            It is very good, but in some (rare!) circumstances
1298                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1299                            some exceptions. --ANK
1300                          */
1301                         err = -EINVAL;
1302                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1303                                 goto out;
1304
1305                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1306
1307                         err = -EHOSTUNREACH;
1308                         if (grt == NULL)
1309                                 goto out;
1310                         if (dev) {
1311                                 if (dev != grt->rt6i_dev) {
1312                                         dst_release(&grt->dst);
1313                                         goto out;
1314                                 }
1315                         } else {
1316                                 dev = grt->rt6i_dev;
1317                                 idev = grt->rt6i_idev;
1318                                 dev_hold(dev);
1319                                 in6_dev_hold(grt->rt6i_idev);
1320                         }
1321                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1322                                 err = 0;
1323                         dst_release(&grt->dst);
1324
1325                         if (err)
1326                                 goto out;
1327                 }
1328                 err = -EINVAL;
1329                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1330                         goto out;
1331         }
1332
1333         err = -ENODEV;
1334         if (dev == NULL)
1335                 goto out;
1336
1337         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1338                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1339                         err = -EINVAL;
1340                         goto out;
1341                 }
1342                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1343                 rt->rt6i_prefsrc.plen = 128;
1344         } else
1345                 rt->rt6i_prefsrc.plen = 0;
1346
1347         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1348                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1349                 if (IS_ERR(rt->rt6i_nexthop)) {
1350                         err = PTR_ERR(rt->rt6i_nexthop);
1351                         rt->rt6i_nexthop = NULL;
1352                         goto out;
1353                 }
1354         }
1355
1356         rt->rt6i_flags = cfg->fc_flags;
1357
1358 install_route:
1359         if (cfg->fc_mx) {
1360                 struct nlattr *nla;
1361                 int remaining;
1362
1363                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1364                         int type = nla_type(nla);
1365
1366                         if (type) {
1367                                 if (type > RTAX_MAX) {
1368                                         err = -EINVAL;
1369                                         goto out;
1370                                 }
1371
1372                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1373                         }
1374                 }
1375         }
1376
1377         rt->dst.dev = dev;
1378         rt->rt6i_idev = idev;
1379         rt->rt6i_table = table;
1380
1381         cfg->fc_nlinfo.nl_net = dev_net(dev);
1382
1383         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1384
1385 out:
1386         if (dev)
1387                 dev_put(dev);
1388         if (idev)
1389                 in6_dev_put(idev);
1390         if (rt)
1391                 dst_free(&rt->dst);
1392         return err;
1393 }
1394
1395 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1396 {
1397         int err;
1398         struct fib6_table *table;
1399         struct net *net = dev_net(rt->rt6i_dev);
1400
1401         if (rt == net->ipv6.ip6_null_entry)
1402                 return -ENOENT;
1403
1404         table = rt->rt6i_table;
1405         write_lock_bh(&table->tb6_lock);
1406
1407         err = fib6_del(rt, info);
1408         dst_release(&rt->dst);
1409
1410         write_unlock_bh(&table->tb6_lock);
1411
1412         return err;
1413 }
1414
1415 int ip6_del_rt(struct rt6_info *rt)
1416 {
1417         struct nl_info info = {
1418                 .nl_net = dev_net(rt->rt6i_dev),
1419         };
1420         return __ip6_del_rt(rt, &info);
1421 }
1422
1423 static int ip6_route_del(struct fib6_config *cfg)
1424 {
1425         struct fib6_table *table;
1426         struct fib6_node *fn;
1427         struct rt6_info *rt;
1428         int err = -ESRCH;
1429
1430         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1431         if (table == NULL)
1432                 return err;
1433
1434         read_lock_bh(&table->tb6_lock);
1435
1436         fn = fib6_locate(&table->tb6_root,
1437                          &cfg->fc_dst, cfg->fc_dst_len,
1438                          &cfg->fc_src, cfg->fc_src_len);
1439
1440         if (fn) {
1441                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1442                         if (cfg->fc_ifindex &&
1443                             (rt->rt6i_dev == NULL ||
1444                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1445                                 continue;
1446                         if (cfg->fc_flags & RTF_GATEWAY &&
1447                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1448                                 continue;
1449                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1450                                 continue;
1451                         dst_hold(&rt->dst);
1452                         read_unlock_bh(&table->tb6_lock);
1453
1454                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1455                 }
1456         }
1457         read_unlock_bh(&table->tb6_lock);
1458
1459         return err;
1460 }
1461
1462 /*
1463  *      Handle redirects
1464  */
/*
 * Flow key used for redirect lookups: a regular flowi6 followed by the
 * address of the router that sent the redirect.
 *
 * __ip6_route_redirect() receives a struct flowi6 pointer and casts it
 * back to this type, so fl6 has to remain the first member.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* key handed to fib6_rule_lookup() */
	struct in6_addr gateway;	/* compared against each route's gateway */
};
1469
1470 static struct rt6_info *__ip6_route_redirect(struct net *net,
1471                                              struct fib6_table *table,
1472                                              struct flowi6 *fl6,
1473                                              int flags)
1474 {
1475         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1476         struct rt6_info *rt;
1477         struct fib6_node *fn;
1478
1479         /*
1480          * Get the "current" route for this destination and
1481          * check if the redirect has come from approriate router.
1482          *
1483          * RFC 2461 specifies that redirects should only be
1484          * accepted if they come from the nexthop to the target.
1485          * Due to the way the routes are chosen, this notion
1486          * is a bit fuzzy and one might need to check all possible
1487          * routes.
1488          */
1489
1490         read_lock_bh(&table->tb6_lock);
1491         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1492 restart:
1493         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1494                 /*
1495                  * Current route is on-link; redirect is always invalid.
1496                  *
1497                  * Seems, previous statement is not true. It could
1498                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1499                  * But then router serving it might decide, that we should
1500                  * know truth 8)8) --ANK (980726).
1501                  */
1502                 if (rt6_check_expired(rt))
1503                         continue;
1504                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1505                         continue;
1506                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1507                         continue;
1508                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1509                         continue;
1510                 break;
1511         }
1512
1513         if (!rt)
1514                 rt = net->ipv6.ip6_null_entry;
1515         BACKTRACK(net, &fl6->saddr);
1516 out:
1517         dst_hold(&rt->dst);
1518
1519         read_unlock_bh(&table->tb6_lock);
1520
1521         return rt;
1522 };
1523
1524 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1525                                            const struct in6_addr *src,
1526                                            const struct in6_addr *gateway,
1527                                            struct net_device *dev)
1528 {
1529         int flags = RT6_LOOKUP_F_HAS_SADDR;
1530         struct net *net = dev_net(dev);
1531         struct ip6rd_flowi rdfl = {
1532                 .fl6 = {
1533                         .flowi6_oif = dev->ifindex,
1534                         .daddr = *dest,
1535                         .saddr = *src,
1536                 },
1537         };
1538
1539         ipv6_addr_copy(&rdfl.gateway, gateway);
1540
1541         if (rt6_need_strict(dest))
1542                 flags |= RT6_LOOKUP_F_IFACE;
1543
1544         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1545                                                    flags, __ip6_route_redirect);
1546 }
1547
1548 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1549                   const struct in6_addr *saddr,
1550                   struct neighbour *neigh, u8 *lladdr, int on_link)
1551 {
1552         struct rt6_info *rt, *nrt = NULL;
1553         struct netevent_redirect netevent;
1554         struct net *net = dev_net(neigh->dev);
1555
1556         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1557
1558         if (rt == net->ipv6.ip6_null_entry) {
1559                 if (net_ratelimit())
1560                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1561                                "for redirect target\n");
1562                 goto out;
1563         }
1564
1565         /*
1566          *      We have finally decided to accept it.
1567          */
1568
1569         neigh_update(neigh, lladdr, NUD_STALE,
1570                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1571                      NEIGH_UPDATE_F_OVERRIDE|
1572                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1573                                      NEIGH_UPDATE_F_ISROUTER))
1574                      );
1575
1576         /*
1577          * Redirect received -> path was valid.
1578          * Look, redirects are sent only in response to data packets,
1579          * so that this nexthop apparently is reachable. --ANK
1580          */
1581         dst_confirm(&rt->dst);
1582
1583         /* Duplicate redirect: silently ignore. */
1584         if (neigh == rt->dst.neighbour)
1585                 goto out;
1586
1587         nrt = ip6_rt_copy(rt);
1588         if (nrt == NULL)
1589                 goto out;
1590
1591         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1592         if (on_link)
1593                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1594
1595         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1596         nrt->rt6i_dst.plen = 128;
1597         nrt->dst.flags |= DST_HOST;
1598
1599         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1600         nrt->rt6i_nexthop = neigh_clone(neigh);
1601
1602         if (ip6_ins_rt(nrt))
1603                 goto out;
1604
1605         netevent.old = &rt->dst;
1606         netevent.new = &nrt->dst;
1607         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1608
1609         if (rt->rt6i_flags&RTF_CACHE) {
1610                 ip6_del_rt(rt);
1611                 return;
1612         }
1613
1614 out:
1615         dst_release(&rt->dst);
1616 }
1617
1618 /*
1619  *      Handle ICMP "packet too big" messages
1620  *      i.e. Path MTU discovery
1621  */
1622
1623 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1624                              struct net *net, u32 pmtu, int ifindex)
1625 {
1626         struct rt6_info *rt, *nrt;
1627         int allfrag = 0;
1628 again:
1629         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1630         if (rt == NULL)
1631                 return;
1632
1633         if (rt6_check_expired(rt)) {
1634                 ip6_del_rt(rt);
1635                 goto again;
1636         }
1637
1638         if (pmtu >= dst_mtu(&rt->dst))
1639                 goto out;
1640
1641         if (pmtu < IPV6_MIN_MTU) {
1642                 /*
1643                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1644                  * MTU (1280) and a fragment header should always be included
1645                  * after a node receiving Too Big message reporting PMTU is
1646                  * less than the IPv6 Minimum Link MTU.
1647                  */
1648                 pmtu = IPV6_MIN_MTU;
1649                 allfrag = 1;
1650         }
1651
1652         /* New mtu received -> path was valid.
1653            They are sent only in response to data packets,
1654            so that this nexthop apparently is reachable. --ANK
1655          */
1656         dst_confirm(&rt->dst);
1657
1658         /* Host route. If it is static, it would be better
1659            not to override it, but add new one, so that
1660            when cache entry will expire old pmtu
1661            would return automatically.
1662          */
1663         if (rt->rt6i_flags & RTF_CACHE) {
1664                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1665                 if (allfrag) {
1666                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1667                         features |= RTAX_FEATURE_ALLFRAG;
1668                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1669                 }
1670                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1671                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1672                 goto out;
1673         }
1674
1675         /* Network route.
1676            Two cases are possible:
1677            1. It is connected route. Action: COW
1678            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1679          */
1680         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1681                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1682         else
1683                 nrt = rt6_alloc_clone(rt, daddr);
1684
1685         if (nrt) {
1686                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1687                 if (allfrag) {
1688                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1689                         features |= RTAX_FEATURE_ALLFRAG;
1690                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1691                 }
1692
1693                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1694                  * happened within 5 mins, the recommended timer is 10 mins.
1695                  * Here this route expiration time is set to ip6_rt_mtu_expires
1696                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1697                  * and detecting PMTU increase will be automatically happened.
1698                  */
1699                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1700                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1701
1702                 ip6_ins_rt(nrt);
1703         }
1704 out:
1705         dst_release(&rt->dst);
1706 }
1707
1708 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1709                         struct net_device *dev, u32 pmtu)
1710 {
1711         struct net *net = dev_net(dev);
1712
1713         /*
1714          * RFC 1981 states that a node "MUST reduce the size of the packets it
1715          * is sending along the path" that caused the Packet Too Big message.
1716          * Since it's not possible in the general case to determine which
1717          * interface was used to send the original packet, we update the MTU
1718          * on the interface that will be used to send future packets. We also
1719          * update the MTU on the interface that received the Packet Too Big in
1720          * case the original packet was forced out that interface with
1721          * SO_BINDTODEVICE or similar. This is the next best thing to the
1722          * correct behaviour, which would be to update the MTU on all
1723          * interfaces.
1724          */
1725         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1726         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1727 }
1728
1729 /*
1730  *      Misc support functions
1731  */
1732
1733 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1734 {
1735         struct net *net = dev_net(ort->rt6i_dev);
1736         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1737                                             ort->dst.dev);
1738
1739         if (rt) {
1740                 rt->dst.input = ort->dst.input;
1741                 rt->dst.output = ort->dst.output;
1742
1743                 dst_copy_metrics(&rt->dst, &ort->dst);
1744                 rt->dst.error = ort->dst.error;
1745                 rt->rt6i_idev = ort->rt6i_idev;
1746                 if (rt->rt6i_idev)
1747                         in6_dev_hold(rt->rt6i_idev);
1748                 rt->dst.lastuse = jiffies;
1749                 rt->rt6i_expires = 0;
1750
1751                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1752                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1753                 rt->rt6i_metric = 0;
1754
1755                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1756 #ifdef CONFIG_IPV6_SUBTREES
1757                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1758 #endif
1759                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1760                 rt->rt6i_table = ort->rt6i_table;
1761         }
1762         return rt;
1763 }
1764
1765 #ifdef CONFIG_IPV6_ROUTE_INFO
1766 static struct rt6_info *rt6_get_route_info(struct net *net,
1767                                            const struct in6_addr *prefix, int prefixlen,
1768                                            const struct in6_addr *gwaddr, int ifindex)
1769 {
1770         struct fib6_node *fn;
1771         struct rt6_info *rt = NULL;
1772         struct fib6_table *table;
1773
1774         table = fib6_get_table(net, RT6_TABLE_INFO);
1775         if (table == NULL)
1776                 return NULL;
1777
1778         write_lock_bh(&table->tb6_lock);
1779         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1780         if (!fn)
1781                 goto out;
1782
1783         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1784                 if (rt->rt6i_dev->ifindex != ifindex)
1785                         continue;
1786                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1787                         continue;
1788                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1789                         continue;
1790                 dst_hold(&rt->dst);
1791                 break;
1792         }
1793 out:
1794         write_unlock_bh(&table->tb6_lock);
1795         return rt;
1796 }
1797
1798 static struct rt6_info *rt6_add_route_info(struct net *net,
1799                                            const struct in6_addr *prefix, int prefixlen,
1800                                            const struct in6_addr *gwaddr, int ifindex,
1801                                            unsigned pref)
1802 {
1803         struct fib6_config cfg = {
1804                 .fc_table       = RT6_TABLE_INFO,
1805                 .fc_metric      = IP6_RT_PRIO_USER,
1806                 .fc_ifindex     = ifindex,
1807                 .fc_dst_len     = prefixlen,
1808                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1809                                   RTF_UP | RTF_PREF(pref),
1810                 .fc_nlinfo.pid = 0,
1811                 .fc_nlinfo.nlh = NULL,
1812                 .fc_nlinfo.nl_net = net,
1813         };
1814
1815         ipv6_addr_copy(&cfg.fc_dst, prefix);
1816         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1817
1818         /* We should treat it as a default route if prefix length is 0. */
1819         if (!prefixlen)
1820                 cfg.fc_flags |= RTF_DEFAULT;
1821
1822         ip6_route_add(&cfg);
1823
1824         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1825 }
1826 #endif
1827
1828 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1829 {
1830         struct rt6_info *rt;
1831         struct fib6_table *table;
1832
1833         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1834         if (table == NULL)
1835                 return NULL;
1836
1837         write_lock_bh(&table->tb6_lock);
1838         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1839                 if (dev == rt->rt6i_dev &&
1840                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1841                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1842                         break;
1843         }
1844         if (rt)
1845                 dst_hold(&rt->dst);
1846         write_unlock_bh(&table->tb6_lock);
1847         return rt;
1848 }
1849
1850 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1851                                      struct net_device *dev,
1852                                      unsigned int pref)
1853 {
1854         struct fib6_config cfg = {
1855                 .fc_table       = RT6_TABLE_DFLT,
1856                 .fc_metric      = IP6_RT_PRIO_USER,
1857                 .fc_ifindex     = dev->ifindex,
1858                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1859                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1860                 .fc_nlinfo.pid = 0,
1861                 .fc_nlinfo.nlh = NULL,
1862                 .fc_nlinfo.nl_net = dev_net(dev),
1863         };
1864
1865         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1866
1867         ip6_route_add(&cfg);
1868
1869         return rt6_get_dflt_router(gwaddr, dev);
1870 }
1871
1872 void rt6_purge_dflt_routers(struct net *net)
1873 {
1874         struct rt6_info *rt;
1875         struct fib6_table *table;
1876
1877         /* NOTE: Keep consistent with rt6_get_dflt_router */
1878         table = fib6_get_table(net, RT6_TABLE_DFLT);
1879         if (table == NULL)
1880                 return;
1881
1882 restart:
1883         read_lock_bh(&table->tb6_lock);
1884         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1885                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1886                         dst_hold(&rt->dst);
1887                         read_unlock_bh(&table->tb6_lock);
1888                         ip6_del_rt(rt);
1889                         goto restart;
1890                 }
1891         }
1892         read_unlock_bh(&table->tb6_lock);
1893 }
1894
1895 static void rtmsg_to_fib6_config(struct net *net,
1896                                  struct in6_rtmsg *rtmsg,
1897                                  struct fib6_config *cfg)
1898 {
1899         memset(cfg, 0, sizeof(*cfg));
1900
1901         cfg->fc_table = RT6_TABLE_MAIN;
1902         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1903         cfg->fc_metric = rtmsg->rtmsg_metric;
1904         cfg->fc_expires = rtmsg->rtmsg_info;
1905         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1906         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1907         cfg->fc_flags = rtmsg->rtmsg_flags;
1908
1909         cfg->fc_nlinfo.nl_net = net;
1910
1911         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1912         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1913         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1914 }
1915
1916 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1917 {
1918         struct fib6_config cfg;
1919         struct in6_rtmsg rtmsg;
1920         int err;
1921
1922         switch(cmd) {
1923         case SIOCADDRT:         /* Add a route */
1924         case SIOCDELRT:         /* Delete a route */
1925                 if (!capable(CAP_NET_ADMIN))
1926                         return -EPERM;
1927                 err = copy_from_user(&rtmsg, arg,
1928                                      sizeof(struct in6_rtmsg));
1929                 if (err)
1930                         return -EFAULT;
1931
1932                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1933
1934                 rtnl_lock();
1935                 switch (cmd) {
1936                 case SIOCADDRT:
1937                         err = ip6_route_add(&cfg);
1938                         break;
1939                 case SIOCDELRT:
1940                         err = ip6_route_del(&cfg);
1941                         break;
1942                 default:
1943                         err = -EINVAL;
1944                 }
1945                 rtnl_unlock();
1946
1947                 return err;
1948         }
1949
1950         return -EINVAL;
1951 }
1952
1953 /*
1954  *      Drop the packet on the floor
1955  */
1956
1957 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1958 {
1959         int type;
1960         struct dst_entry *dst = skb_dst(skb);
1961         switch (ipstats_mib_noroutes) {
1962         case IPSTATS_MIB_INNOROUTES:
1963                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1964                 if (type == IPV6_ADDR_ANY) {
1965                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1966                                       IPSTATS_MIB_INADDRERRORS);
1967                         break;
1968                 }
1969                 /* FALLTHROUGH */
1970         case IPSTATS_MIB_OUTNOROUTES:
1971                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1972                               ipstats_mib_noroutes);
1973                 break;
1974         }
1975         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1976         kfree_skb(skb);
1977         return 0;
1978 }
1979
1980 static int ip6_pkt_discard(struct sk_buff *skb)
1981 {
1982         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1983 }
1984
1985 static int ip6_pkt_discard_out(struct sk_buff *skb)
1986 {
1987         skb->dev = skb_dst(skb)->dev;
1988         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1989 }
1990
1991 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1992
1993 static int ip6_pkt_prohibit(struct sk_buff *skb)
1994 {
1995         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1996 }
1997
1998 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1999 {
2000         skb->dev = skb_dst(skb)->dev;
2001         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2002 }
2003
2004 #endif
2005
2006 /*
2007  *      Allocate a dst for local (unicast / anycast) address.
2008  */
2009
2010 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2011                                     const struct in6_addr *addr,
2012                                     int anycast)
2013 {
2014         struct net *net = dev_net(idev->dev);
2015         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2016                                             net->loopback_dev);
2017         struct neighbour *neigh;
2018
2019         if (rt == NULL) {
2020                 if (net_ratelimit())
2021                         pr_warning("IPv6:  Maximum number of routes reached,"
2022                                    " consider increasing route/max_size.\n");
2023                 return ERR_PTR(-ENOMEM);
2024         }
2025
2026         in6_dev_hold(idev);
2027
2028         rt->dst.flags = DST_HOST;
2029         rt->dst.input = ip6_input;
2030         rt->dst.output = ip6_output;
2031         rt->rt6i_idev = idev;
2032         rt->dst.obsolete = -1;
2033
2034         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2035         if (anycast)
2036                 rt->rt6i_flags |= RTF_ANYCAST;
2037         else
2038                 rt->rt6i_flags |= RTF_LOCAL;
2039         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2040         if (IS_ERR(neigh)) {
2041                 dst_free(&rt->dst);
2042
2043                 return ERR_CAST(neigh);
2044         }
2045         rt->rt6i_nexthop = neigh;
2046
2047         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2048         rt->rt6i_dst.plen = 128;
2049         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2050
2051         atomic_set(&rt->dst.__refcnt, 1);
2052
2053         return rt;
2054 }
2055
2056 int ip6_route_get_saddr(struct net *net,
2057                         struct rt6_info *rt,
2058                         const struct in6_addr *daddr,
2059                         unsigned int prefs,
2060                         struct in6_addr *saddr)
2061 {
2062         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2063         int err = 0;
2064         if (rt->rt6i_prefsrc.plen)
2065                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2066         else
2067                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2068                                          daddr, prefs, saddr);
2069         return err;
2070 }
2071
2072 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace, used to skip its null entry */
	struct in6_addr *addr;		/* address being removed */
};
2078
2079 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2080 {
2081         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2082         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2083         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2084
2085         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2086             rt != net->ipv6.ip6_null_entry &&
2087             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2088                 /* remove prefsrc entry */
2089                 rt->rt6i_prefsrc.plen = 0;
2090         }
2091         return 0;
2092 }
2093
2094 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2095 {
2096         struct net *net = dev_net(ifp->idev->dev);
2097         struct arg_dev_net_ip adni = {
2098                 .dev = ifp->idev->dev,
2099                 .net = net,
2100                 .addr = &ifp->addr,
2101         };
2102         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2103 }
2104
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any */
	struct net *net;		/* namespace, used to skip its null entry */
};
2109
2110 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2111 {
2112         const struct arg_dev_net *adn = arg;
2113         const struct net_device *dev = adn->dev;
2114
2115         if ((rt->rt6i_dev == dev || dev == NULL) &&
2116             rt != adn->net->ipv6.ip6_null_entry) {
2117                 RT6_TRACE("deleted by ifdown %p\n", rt);
2118                 return -1;
2119         }
2120         return 0;
2121 }
2122
2123 void rt6_ifdown(struct net *net, struct net_device *dev)
2124 {
2125         struct arg_dev_net adn = {
2126                 .dev = dev,
2127                 .net = net,
2128         };
2129
2130         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2131         icmp6_clean_all(fib6_ifdown, &adn);
2132 }
2133
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* its new MTU */
};
2139
2140 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2141 {
2142         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2143         struct inet6_dev *idev;
2144
2145         /* In IPv6 pmtu discovery is not optional,
2146            so that RTAX_MTU lock cannot disable it.
2147            We still use this lock to block changes
2148            caused by addrconf/ndisc.
2149         */
2150
2151         idev = __in6_dev_get(arg->dev);
2152         if (idev == NULL)
2153                 return 0;
2154
2155         /* For administrative MTU increase, there is no way to discover
2156            IPv6 PMTU increase, so PMTU increase should be updated here.
2157            Since RFC 1981 doesn't include administrative MTU increase
2158            update PMTU increase is a MUST. (i.e. jumbo frame)
2159          */
2160         /*
2161            If new MTU is less than route PMTU, this new MTU will be the
2162            lowest MTU in the path, update the route PMTU to reflect PMTU
2163            decreases; if new MTU is greater than route PMTU, and the
2164            old MTU is the lowest MTU in the path, update the route PMTU
2165            to reflect the increase. In this case if the other nodes' MTU
2166            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2167            PMTU discouvery.
2168          */
2169         if (rt->rt6i_dev == arg->dev &&
2170             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2171             (dst_mtu(&rt->dst) >= arg->mtu ||
2172              (dst_mtu(&rt->dst) < arg->mtu &&
2173               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2174                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2175         }
2176         return 0;
2177 }
2178
2179 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2180 {
2181         struct rt6_mtu_change_arg arg = {
2182                 .dev = dev,
2183                 .mtu = mtu,
2184         };
2185
2186         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2187 }
2188
2189 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2190         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2191         [RTA_OIF]               = { .type = NLA_U32 },
2192         [RTA_IIF]               = { .type = NLA_U32 },
2193         [RTA_PRIORITY]          = { .type = NLA_U32 },
2194         [RTA_METRICS]           = { .type = NLA_NESTED },
2195 };
2196
2197 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2198                               struct fib6_config *cfg)
2199 {
2200         struct rtmsg *rtm;
2201         struct nlattr *tb[RTA_MAX+1];
2202         int err;
2203
2204         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2205         if (err < 0)
2206                 goto errout;
2207
2208         err = -EINVAL;
2209         rtm = nlmsg_data(nlh);
2210         memset(cfg, 0, sizeof(*cfg));
2211
2212         cfg->fc_table = rtm->rtm_table;
2213         cfg->fc_dst_len = rtm->rtm_dst_len;
2214         cfg->fc_src_len = rtm->rtm_src_len;
2215         cfg->fc_flags = RTF_UP;
2216         cfg->fc_protocol = rtm->rtm_protocol;
2217
2218         if (rtm->rtm_type == RTN_UNREACHABLE)
2219                 cfg->fc_flags |= RTF_REJECT;
2220
2221         if (rtm->rtm_type == RTN_LOCAL)
2222                 cfg->fc_flags |= RTF_LOCAL;
2223
2224         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2225         cfg->fc_nlinfo.nlh = nlh;
2226         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2227
2228         if (tb[RTA_GATEWAY]) {
2229                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2230                 cfg->fc_flags |= RTF_GATEWAY;
2231         }
2232
2233         if (tb[RTA_DST]) {
2234                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2235
2236                 if (nla_len(tb[RTA_DST]) < plen)
2237                         goto errout;
2238
2239                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2240         }
2241
2242         if (tb[RTA_SRC]) {
2243                 int plen = (rtm->rtm_src_len + 7) >> 3;
2244
2245                 if (nla_len(tb[RTA_SRC]) < plen)
2246                         goto errout;
2247
2248                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2249         }
2250
2251         if (tb[RTA_PREFSRC])
2252                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2253
2254         if (tb[RTA_OIF])
2255                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2256
2257         if (tb[RTA_PRIORITY])
2258                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2259
2260         if (tb[RTA_METRICS]) {
2261                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2262                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2263         }
2264
2265         if (tb[RTA_TABLE])
2266                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2267
2268         err = 0;
2269 errout:
2270         return err;
2271 }
2272
2273 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2274 {
2275         struct fib6_config cfg;
2276         int err;
2277
2278         err = rtm_to_fib6_config(skb, nlh, &cfg);
2279         if (err < 0)
2280                 return err;
2281
2282         return ip6_route_del(&cfg);
2283 }
2284
2285 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2286 {
2287         struct fib6_config cfg;
2288         int err;
2289
2290         err = rtm_to_fib6_config(skb, nlh, &cfg);
2291         if (err < 0)
2292                 return err;
2293
2294         return ip6_route_add(&cfg);
2295 }
2296
2297 static inline size_t rt6_nlmsg_size(void)
2298 {
2299         return NLMSG_ALIGN(sizeof(struct rtmsg))
2300                + nla_total_size(16) /* RTA_SRC */
2301                + nla_total_size(16) /* RTA_DST */
2302                + nla_total_size(16) /* RTA_GATEWAY */
2303                + nla_total_size(16) /* RTA_PREFSRC */
2304                + nla_total_size(4) /* RTA_TABLE */
2305                + nla_total_size(4) /* RTA_IIF */
2306                + nla_total_size(4) /* RTA_OIF */
2307                + nla_total_size(4) /* RTA_PRIORITY */
2308                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2309                + nla_total_size(sizeof(struct rta_cacheinfo));
2310 }
2311
2312 static int rt6_fill_node(struct net *net,
2313                          struct sk_buff *skb, struct rt6_info *rt,
2314                          struct in6_addr *dst, struct in6_addr *src,
2315                          int iif, int type, u32 pid, u32 seq,
2316                          int prefix, int nowait, unsigned int flags)
2317 {
2318         struct rtmsg *rtm;
2319         struct nlmsghdr *nlh;
2320         long expires;
2321         u32 table;
2322
2323         if (prefix) {   /* user wants prefix routes only */
2324                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2325                         /* success since this is not a prefix route */
2326                         return 1;
2327                 }
2328         }
2329
2330         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2331         if (nlh == NULL)
2332                 return -EMSGSIZE;
2333
2334         rtm = nlmsg_data(nlh);
2335         rtm->rtm_family = AF_INET6;
2336         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2337         rtm->rtm_src_len = rt->rt6i_src.plen;
2338         rtm->rtm_tos = 0;
2339         if (rt->rt6i_table)
2340                 table = rt->rt6i_table->tb6_id;
2341         else
2342                 table = RT6_TABLE_UNSPEC;
2343         rtm->rtm_table = table;
2344         NLA_PUT_U32(skb, RTA_TABLE, table);
2345         if (rt->rt6i_flags&RTF_REJECT)
2346                 rtm->rtm_type = RTN_UNREACHABLE;
2347         else if (rt->rt6i_flags&RTF_LOCAL)
2348                 rtm->rtm_type = RTN_LOCAL;
2349         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2350                 rtm->rtm_type = RTN_LOCAL;
2351         else
2352                 rtm->rtm_type = RTN_UNICAST;
2353         rtm->rtm_flags = 0;
2354         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2355         rtm->rtm_protocol = rt->rt6i_protocol;
2356         if (rt->rt6i_flags&RTF_DYNAMIC)
2357                 rtm->rtm_protocol = RTPROT_REDIRECT;
2358         else if (rt->rt6i_flags & RTF_ADDRCONF)
2359                 rtm->rtm_protocol = RTPROT_KERNEL;
2360         else if (rt->rt6i_flags&RTF_DEFAULT)
2361                 rtm->rtm_protocol = RTPROT_RA;
2362
2363         if (rt->rt6i_flags&RTF_CACHE)
2364                 rtm->rtm_flags |= RTM_F_CLONED;
2365
2366         if (dst) {
2367                 NLA_PUT(skb, RTA_DST, 16, dst);
2368                 rtm->rtm_dst_len = 128;
2369         } else if (rtm->rtm_dst_len)
2370                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2371 #ifdef CONFIG_IPV6_SUBTREES
2372         if (src) {
2373                 NLA_PUT(skb, RTA_SRC, 16, src);
2374                 rtm->rtm_src_len = 128;
2375         } else if (rtm->rtm_src_len)
2376                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2377 #endif
2378         if (iif) {
2379 #ifdef CONFIG_IPV6_MROUTE
2380                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2381                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2382                         if (err <= 0) {
2383                                 if (!nowait) {
2384                                         if (err == 0)
2385                                                 return 0;
2386                                         goto nla_put_failure;
2387                                 } else {
2388                                         if (err == -EMSGSIZE)
2389                                                 goto nla_put_failure;
2390                                 }
2391                         }
2392                 } else
2393 #endif
2394                         NLA_PUT_U32(skb, RTA_IIF, iif);
2395         } else if (dst) {
2396                 struct in6_addr saddr_buf;
2397                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2398                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2399         }
2400
2401         if (rt->rt6i_prefsrc.plen) {
2402                 struct in6_addr saddr_buf;
2403                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2404                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2405         }
2406
2407         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2408                 goto nla_put_failure;
2409
2410         if (rt->dst.neighbour)
2411                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2412
2413         if (rt->dst.dev)
2414                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2415
2416         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2417
2418         if (!(rt->rt6i_flags & RTF_EXPIRES))
2419                 expires = 0;
2420         else if (rt->rt6i_expires - jiffies < INT_MAX)
2421                 expires = rt->rt6i_expires - jiffies;
2422         else
2423                 expires = INT_MAX;
2424
2425         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2426                                expires, rt->dst.error) < 0)
2427                 goto nla_put_failure;
2428
2429         return nlmsg_end(skb, nlh);
2430
2431 nla_put_failure:
2432         nlmsg_cancel(skb, nlh);
2433         return -EMSGSIZE;
2434 }
2435
2436 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2437 {
2438         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2439         int prefix;
2440
2441         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2442                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2443                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2444         } else
2445                 prefix = 0;
2446
2447         return rt6_fill_node(arg->net,
2448                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2449                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2450                      prefix, 0, NLM_F_MULTI);
2451 }
2452
2453 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2454 {
2455         struct net *net = sock_net(in_skb->sk);
2456         struct nlattr *tb[RTA_MAX+1];
2457         struct rt6_info *rt;
2458         struct sk_buff *skb;
2459         struct rtmsg *rtm;
2460         struct flowi6 fl6;
2461         int err, iif = 0;
2462
2463         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2464         if (err < 0)
2465                 goto errout;
2466
2467         err = -EINVAL;
2468         memset(&fl6, 0, sizeof(fl6));
2469
2470         if (tb[RTA_SRC]) {
2471                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2472                         goto errout;
2473
2474                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2475         }
2476
2477         if (tb[RTA_DST]) {
2478                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2479                         goto errout;
2480
2481                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2482         }
2483
2484         if (tb[RTA_IIF])
2485                 iif = nla_get_u32(tb[RTA_IIF]);
2486
2487         if (tb[RTA_OIF])
2488                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2489
2490         if (iif) {
2491                 struct net_device *dev;
2492                 dev = __dev_get_by_index(net, iif);
2493                 if (!dev) {
2494                         err = -ENODEV;
2495                         goto errout;
2496                 }
2497         }
2498
2499         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2500         if (skb == NULL) {
2501                 err = -ENOBUFS;
2502                 goto errout;
2503         }
2504
2505         /* Reserve room for dummy headers, this skb can pass
2506            through good chunk of routing engine.
2507          */
2508         skb_reset_mac_header(skb);
2509         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2510
2511         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2512         skb_dst_set(skb, &rt->dst);
2513
2514         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2515                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2516                             nlh->nlmsg_seq, 0, 0, 0);
2517         if (err < 0) {
2518                 kfree_skb(skb);
2519                 goto errout;
2520         }
2521
2522         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2523 errout:
2524         return err;
2525 }
2526
2527 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2528 {
2529         struct sk_buff *skb;
2530         struct net *net = info->nl_net;
2531         u32 seq;
2532         int err;
2533
2534         err = -ENOBUFS;
2535         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2536
2537         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2538         if (skb == NULL)
2539                 goto errout;
2540
2541         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2542                                 event, info->pid, seq, 0, 0, 0);
2543         if (err < 0) {
2544                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2545                 WARN_ON(err == -EMSGSIZE);
2546                 kfree_skb(skb);
2547                 goto errout;
2548         }
2549         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2550                     info->nlh, gfp_any());
2551         return;
2552 errout:
2553         if (err < 0)
2554                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2555 }
2556
2557 static int ip6_route_dev_notify(struct notifier_block *this,
2558                                 unsigned long event, void *data)
2559 {
2560         struct net_device *dev = (struct net_device *)data;
2561         struct net *net = dev_net(dev);
2562
2563         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2564                 net->ipv6.ip6_null_entry->dst.dev = dev;
2565                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2566 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2567                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2568                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2569                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2570                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2571 #endif
2572         }
2573
2574         return NOTIFY_OK;
2575 }
2576
2577 /*
2578  *      /proc
2579  */
2580
2581 #ifdef CONFIG_PROC_FS
2582
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): not referenced by the seq_file based code visible in
 * this part of the file; possibly a leftover - confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2591
2592 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2593 {
2594         struct seq_file *m = p_arg;
2595
2596         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2597
2598 #ifdef CONFIG_IPV6_SUBTREES
2599         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2600 #else
2601         seq_puts(m, "00000000000000000000000000000000 00 ");
2602 #endif
2603
2604         if (rt->rt6i_nexthop) {
2605                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2606         } else {
2607                 seq_puts(m, "00000000000000000000000000000000");
2608         }
2609         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2610                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2611                    rt->dst.__use, rt->rt6i_flags,
2612                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2613         return 0;
2614 }
2615
2616 static int ipv6_route_show(struct seq_file *m, void *v)
2617 {
2618         struct net *net = (struct net *)m->private;
2619         fib6_clean_all(net, rt6_info_route, 0, m);
2620         return 0;
2621 }
2622
2623 static int ipv6_route_open(struct inode *inode, struct file *file)
2624 {
2625         return single_open_net(inode, file, ipv6_route_show);
2626 }
2627
2628 static const struct file_operations ipv6_route_proc_fops = {
2629         .owner          = THIS_MODULE,
2630         .open           = ipv6_route_open,
2631         .read           = seq_read,
2632         .llseek         = seq_lseek,
2633         .release        = single_release_net,
2634 };
2635
2636 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2637 {
2638         struct net *net = (struct net *)seq->private;
2639         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2640                    net->ipv6.rt6_stats->fib_nodes,
2641                    net->ipv6.rt6_stats->fib_route_nodes,
2642                    net->ipv6.rt6_stats->fib_rt_alloc,
2643                    net->ipv6.rt6_stats->fib_rt_entries,
2644                    net->ipv6.rt6_stats->fib_rt_cache,
2645                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2646                    net->ipv6.rt6_stats->fib_discarded_routes);
2647
2648         return 0;
2649 }
2650
2651 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2652 {
2653         return single_open_net(inode, file, rt6_stats_seq_show);
2654 }
2655
2656 static const struct file_operations rt6_stats_seq_fops = {
2657         .owner   = THIS_MODULE,
2658         .open    = rt6_stats_seq_open,
2659         .read    = seq_read,
2660         .llseek  = seq_lseek,
2661         .release = single_release_net,
2662 };
2663 #endif  /* CONFIG_PROC_FS */
2664
2665 #ifdef CONFIG_SYSCTL
2666
2667 static
2668 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2669                               void __user *buffer, size_t *lenp, loff_t *ppos)
2670 {
2671         struct net *net;
2672         int delay;
2673         if (!write)
2674                 return -EINVAL;
2675
2676         net = (struct net *)ctl->extra1;
2677         delay = net->ipv6.sysctl.flush_delay;
2678         proc_dointvec(ctl, write, buffer, lenp, ppos);
2679         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2680         return 0;
2681 }
2682
2683 ctl_table ipv6_route_table_template[] = {
2684         {
2685                 .procname       =       "flush",
2686                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2687                 .maxlen         =       sizeof(int),
2688                 .mode           =       0200,
2689                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2690         },
2691         {
2692                 .procname       =       "gc_thresh",
2693                 .data           =       &ip6_dst_ops_template.gc_thresh,
2694                 .maxlen         =       sizeof(int),
2695                 .mode           =       0644,
2696                 .proc_handler   =       proc_dointvec,
2697         },
2698         {
2699                 .procname       =       "max_size",
2700                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2701                 .maxlen         =       sizeof(int),
2702                 .mode           =       0644,
2703                 .proc_handler   =       proc_dointvec,
2704         },
2705         {
2706                 .procname       =       "gc_min_interval",
2707                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2708                 .maxlen         =       sizeof(int),
2709                 .mode           =       0644,
2710                 .proc_handler   =       proc_dointvec_jiffies,
2711         },
2712         {
2713                 .procname       =       "gc_timeout",
2714                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2715                 .maxlen         =       sizeof(int),
2716                 .mode           =       0644,
2717                 .proc_handler   =       proc_dointvec_jiffies,
2718         },
2719         {
2720                 .procname       =       "gc_interval",
2721                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2722                 .maxlen         =       sizeof(int),
2723                 .mode           =       0644,
2724                 .proc_handler   =       proc_dointvec_jiffies,
2725         },
2726         {
2727                 .procname       =       "gc_elasticity",
2728                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2729                 .maxlen         =       sizeof(int),
2730                 .mode           =       0644,
2731                 .proc_handler   =       proc_dointvec,
2732         },
2733         {
2734                 .procname       =       "mtu_expires",
2735                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2736                 .maxlen         =       sizeof(int),
2737                 .mode           =       0644,
2738                 .proc_handler   =       proc_dointvec_jiffies,
2739         },
2740         {
2741                 .procname       =       "min_adv_mss",
2742                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2743                 .maxlen         =       sizeof(int),
2744                 .mode           =       0644,
2745                 .proc_handler   =       proc_dointvec,
2746         },
2747         {
2748                 .procname       =       "gc_min_interval_ms",
2749                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2750                 .maxlen         =       sizeof(int),
2751                 .mode           =       0644,
2752                 .proc_handler   =       proc_dointvec_ms_jiffies,
2753         },
2754         { }
2755 };
2756
2757 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2758 {
2759         struct ctl_table *table;
2760
2761         table = kmemdup(ipv6_route_table_template,
2762                         sizeof(ipv6_route_table_template),
2763                         GFP_KERNEL);
2764
2765         if (table) {
2766                 table[0].data = &net->ipv6.sysctl.flush_delay;
2767                 table[0].extra1 = net;
2768                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2769                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2770                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2771                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2772                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2773                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2774                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2775                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2776                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2777         }
2778
2779         return table;
2780 }
2781 #endif
2782
2783 static int __net_init ip6_route_net_init(struct net *net)
2784 {
2785         int ret = -ENOMEM;
2786
2787         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2788                sizeof(net->ipv6.ip6_dst_ops));
2789
2790         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2791                 goto out_ip6_dst_ops;
2792
2793         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2794                                            sizeof(*net->ipv6.ip6_null_entry),
2795                                            GFP_KERNEL);
2796         if (!net->ipv6.ip6_null_entry)
2797                 goto out_ip6_dst_entries;
2798         net->ipv6.ip6_null_entry->dst.path =
2799                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2800         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2801         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2802                          ip6_template_metrics, true);
2803
2804 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2805         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2806                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2807                                                GFP_KERNEL);
2808         if (!net->ipv6.ip6_prohibit_entry)
2809                 goto out_ip6_null_entry;
2810         net->ipv6.ip6_prohibit_entry->dst.path =
2811                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2812         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2813         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2814                          ip6_template_metrics, true);
2815
2816         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2817                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2818                                                GFP_KERNEL);
2819         if (!net->ipv6.ip6_blk_hole_entry)
2820                 goto out_ip6_prohibit_entry;
2821         net->ipv6.ip6_blk_hole_entry->dst.path =
2822                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2823         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2824         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2825                          ip6_template_metrics, true);
2826 #endif
2827
2828         net->ipv6.sysctl.flush_delay = 0;
2829         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2830         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2831         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2832         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2833         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2834         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2835         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2836
2837 #ifdef CONFIG_PROC_FS
2838         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2839         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2840 #endif
2841         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2842
2843         ret = 0;
2844 out:
2845         return ret;
2846
2847 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2848 out_ip6_prohibit_entry:
2849         kfree(net->ipv6.ip6_prohibit_entry);
2850 out_ip6_null_entry:
2851         kfree(net->ipv6.ip6_null_entry);
2852 #endif
2853 out_ip6_dst_entries:
2854         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2855 out_ip6_dst_ops:
2856         goto out;
2857 }
2858
2859 static void __net_exit ip6_route_net_exit(struct net *net)
2860 {
2861 #ifdef CONFIG_PROC_FS
2862         proc_net_remove(net, "ipv6_route");
2863         proc_net_remove(net, "rt6_stats");
2864 #endif
2865         kfree(net->ipv6.ip6_null_entry);
2866 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2867         kfree(net->ipv6.ip6_prohibit_entry);
2868         kfree(net->ipv6.ip6_blk_hole_entry);
2869 #endif
2870         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2871 }
2872
2873 static struct pernet_operations ip6_route_net_ops = {
2874         .init = ip6_route_net_init,
2875         .exit = ip6_route_net_exit,
2876 };
2877
2878 static struct notifier_block ip6_route_dev_notifier = {
2879         .notifier_call = ip6_route_dev_notify,
2880         .priority = 0,
2881 };
2882
2883 int __init ip6_route_init(void)
2884 {
2885         int ret;
2886
2887         ret = -ENOMEM;
2888         ip6_dst_ops_template.kmem_cachep =
2889                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2890                                   SLAB_HWCACHE_ALIGN, NULL);
2891         if (!ip6_dst_ops_template.kmem_cachep)
2892                 goto out;
2893
2894         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2895         if (ret)
2896                 goto out_kmem_cache;
2897
2898         ret = register_pernet_subsys(&ip6_route_net_ops);
2899         if (ret)
2900                 goto out_dst_entries;
2901
2902         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2903
2904         /* Registering of the loopback is done before this portion of code,
2905          * the loopback reference in rt6_info will not be taken, do it
2906          * manually for init_net */
2907         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2908         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2909   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2910         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2911         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2912         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2913         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2914   #endif
2915         ret = fib6_init();
2916         if (ret)
2917                 goto out_register_subsys;
2918
2919         ret = xfrm6_init();
2920         if (ret)
2921                 goto out_fib6_init;
2922
2923         ret = fib6_rules_init();
2924         if (ret)
2925                 goto xfrm6_init;
2926
2927         ret = -ENOBUFS;
2928         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2929             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2930             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2931                 goto fib6_rules_init;
2932
2933         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2934         if (ret)
2935                 goto fib6_rules_init;
2936
2937 out:
2938         return ret;
2939
2940 fib6_rules_init:
2941         fib6_rules_cleanup();
2942 xfrm6_init:
2943         xfrm6_fini();
2944 out_fib6_init:
2945         fib6_gc_cleanup();
2946 out_register_subsys:
2947         unregister_pernet_subsys(&ip6_route_net_ops);
2948 out_dst_entries:
2949         dst_entries_destroy(&ip6_dst_blackhole_ops);
2950 out_kmem_cache:
2951         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2952         goto out;
2953 }
2954
2955 void ip6_route_cleanup(void)
2956 {
2957         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2958         fib6_rules_cleanup();
2959         xfrm6_fini();
2960         fib6_gc_cleanup();
2961         unregister_pernet_subsys(&ip6_route_net_ops);
2962         dst_entries_destroy(&ip6_dst_blackhole_ops);
2963         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2964 }