2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
112 #include <net/secure_seq.h>
114 #define RT_FL_TOS(oldflp4) \
115 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117 #define IP_MAX_MTU 0xFFF0
119 #define RT_GC_TIMEOUT (300*HZ)
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly = 8;
131 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly = 256;
134 static int rt_chain_length_max __read_mostly = 20;
135 static int redirect_genid;
137 static struct delayed_work expires_work;
138 static unsigned long expires_ljiffies;
141 * Interface to generic destination cache.
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int ipv4_mtu(const struct dst_entry *dst);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void ipv4_link_failure(struct sk_buff *skb);
150 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 struct rtable *rt = (struct rtable *) dst;
161 struct inet_peer *peer;
165 rt_bind_peer(rt, rt->rt_dst, 1);
169 u32 *old_p = __DST_METRICS_PTR(old);
170 unsigned long prev, new;
173 if (inet_metrics_new(peer))
174 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176 new = (unsigned long) p;
177 prev = cmpxchg(&dst->_metrics, old, new);
180 p = __DST_METRICS_PTR(prev);
181 if (prev & DST_METRICS_READ_ONLY)
185 fib_info_put(rt->fi);
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
195 static struct dst_ops ipv4_dst_ops = {
197 .protocol = cpu_to_be16(ETH_P_IP),
198 .gc = rt_garbage_collect,
199 .check = ipv4_dst_check,
200 .default_advmss = ipv4_default_advmss,
202 .cow_metrics = ipv4_cow_metrics,
203 .destroy = ipv4_dst_destroy,
204 .ifdown = ipv4_dst_ifdown,
205 .negative_advice = ipv4_negative_advice,
206 .link_failure = ipv4_link_failure,
207 .update_pmtu = ip_rt_update_pmtu,
208 .local_out = __ip_local_out,
209 .neigh_lookup = ipv4_neigh_lookup,
212 #define ECN_OR_COST(class) TC_PRIO_##class
214 const __u8 ip_tos2prio[16] = {
216 ECN_OR_COST(BESTEFFORT),
218 ECN_OR_COST(BESTEFFORT),
224 ECN_OR_COST(INTERACTIVE),
226 ECN_OR_COST(INTERACTIVE),
227 TC_PRIO_INTERACTIVE_BULK,
228 ECN_OR_COST(INTERACTIVE_BULK),
229 TC_PRIO_INTERACTIVE_BULK,
230 ECN_OR_COST(INTERACTIVE_BULK)
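/* Illustrative note (not part of the original file): this table is indexed
 * by the four RFC 1349 TOS bits shifted right once (see rt_tos2priority()
 * in include/net/route.h), mapping each TOS value to a TC_PRIO_* band;
 * ECN_OR_COST() keeps the ECN-capable codepoints in the same band as their
 * base class.
 */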
238 /* The locking scheme is rather straightforward:
240 * 1) Read-Copy Update protects the buckets of the central route hash.
241 * 2) Only writers remove entries, and they hold the lock
242 * as they look at rtable reference counts.
243 * 3) Only readers acquire references to rtable entries,
244 * they do so with atomic increments and RCU-protected lookups.
248 struct rt_hash_bucket {
249 struct rtable __rcu *chain;
252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
253 defined(CONFIG_PROVE_LOCKING)
255 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
256 * The size of this table is a power of two and depends on the number of CPUs.
257 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259 #ifdef CONFIG_LOCKDEP
260 # define RT_HASH_LOCK_SZ 256
263 # define RT_HASH_LOCK_SZ 4096
265 # define RT_HASH_LOCK_SZ 2048
267 # define RT_HASH_LOCK_SZ 1024
269 # define RT_HASH_LOCK_SZ 512
271 # define RT_HASH_LOCK_SZ 256
275 static spinlock_t *rt_hash_locks;
276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
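/* Illustrative note (not part of the original file): bucket `slot' maps to
 * rt_hash_locks[slot & (RT_HASH_LOCK_SZ - 1)], so many hash buckets share
 * one spinlock; this bounds lock-table memory while keeping contention low,
 * since writers touching different buckets rarely collide on a lock.
 */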
278 static __init void rt_hash_lock_init(void)
282 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
285 panic("IP: failed to allocate rt_hash_locks\n");
287 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
288 spin_lock_init(&rt_hash_locks[i]);
291 # define rt_hash_lock_addr(slot) NULL
293 static inline void rt_hash_lock_init(void)
298 static struct rt_hash_bucket *rt_hash_table __read_mostly;
299 static unsigned rt_hash_mask __read_mostly;
300 static unsigned int rt_hash_log __read_mostly;
302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
308 return jhash_3words((__force u32)daddr, (__force u32)saddr,
313 static inline int rt_genid(struct net *net)
315 return atomic_read(&net->ipv4.rt_genid);
318 #ifdef CONFIG_PROC_FS
319 struct rt_cache_iter_state {
320 struct seq_net_private p;
325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
327 struct rt_cache_iter_state *st = seq->private;
328 struct rtable *r = NULL;
330 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
331 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
334 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
337 r->rt_genid == st->genid)
339 r = rcu_dereference_bh(r->dst.rt_next);
341 rcu_read_unlock_bh();
346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
349 struct rt_cache_iter_state *st = seq->private;
351 r = rcu_dereference_bh(r->dst.rt_next);
353 rcu_read_unlock_bh();
355 if (--st->bucket < 0)
357 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
367 struct rt_cache_iter_state *st = seq->private;
368 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369 if (dev_net(r->dst.dev) != seq_file_net(seq))
371 if (r->rt_genid == st->genid)
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379 struct rtable *r = rt_cache_get_first(seq);
382 while (pos && (r = rt_cache_get_next(seq, r)))
384 return pos ? NULL : r;
387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389 struct rt_cache_iter_state *st = seq->private;
391 return rt_cache_get_idx(seq, *pos - 1);
392 st->genid = rt_genid(seq_file_net(seq));
393 return SEQ_START_TOKEN;
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
400 if (v == SEQ_START_TOKEN)
401 r = rt_cache_get_first(seq);
403 r = rt_cache_get_next(seq, v);
408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410 if (v && v != SEQ_START_TOKEN)
411 rcu_read_unlock_bh();
414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
416 if (v == SEQ_START_TOKEN)
417 seq_printf(seq, "%-127s\n",
418 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
422 struct rtable *r = v;
427 n = dst_get_neighbour_noref(&r->dst);
428 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
431 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
433 r->dst.dev ? r->dst.dev->name : "*",
434 (__force u32)r->rt_dst,
435 (__force u32)r->rt_gateway,
436 r->rt_flags, atomic_read(&r->dst.__refcnt),
437 r->dst.__use, 0, (__force u32)r->rt_src,
438 dst_metric_advmss(&r->dst) + 40,
439 dst_metric(&r->dst, RTAX_WINDOW),
440 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441 dst_metric(&r->dst, RTAX_RTTVAR)),
445 r->rt_spec_dst, &len);
447 seq_printf(seq, "%*s\n", 127 - len, "");
452 static const struct seq_operations rt_cache_seq_ops = {
453 .start = rt_cache_seq_start,
454 .next = rt_cache_seq_next,
455 .stop = rt_cache_seq_stop,
456 .show = rt_cache_seq_show,
459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
461 return seq_open_net(inode, file, &rt_cache_seq_ops,
462 sizeof(struct rt_cache_iter_state));
465 static const struct file_operations rt_cache_seq_fops = {
466 .owner = THIS_MODULE,
467 .open = rt_cache_seq_open,
470 .release = seq_release_net,
474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
479 return SEQ_START_TOKEN;
481 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
482 if (!cpu_possible(cpu))
485 return &per_cpu(rt_cache_stat, cpu);
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
494 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495 if (!cpu_possible(cpu))
498 return &per_cpu(rt_cache_stat, cpu);
504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
511 struct rt_cache_stat *st = v;
513 if (v == SEQ_START_TOKEN) {
514 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
518 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
519 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
520 dst_entries_get_slow(&ipv4_dst_ops),
543 static const struct seq_operations rt_cpu_seq_ops = {
544 .start = rt_cpu_seq_start,
545 .next = rt_cpu_seq_next,
546 .stop = rt_cpu_seq_stop,
547 .show = rt_cpu_seq_show,
551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
553 return seq_open(file, &rt_cpu_seq_ops);
556 static const struct file_operations rt_cpu_seq_fops = {
557 .owner = THIS_MODULE,
558 .open = rt_cpu_seq_open,
561 .release = seq_release,
564 #ifdef CONFIG_IP_ROUTE_CLASSID
565 static int rt_acct_proc_show(struct seq_file *m, void *v)
567 struct ip_rt_acct *dst, *src;
570 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
574 for_each_possible_cpu(i) {
575 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576 for (j = 0; j < 256; j++) {
577 dst[j].o_bytes += src[j].o_bytes;
578 dst[j].o_packets += src[j].o_packets;
579 dst[j].i_bytes += src[j].i_bytes;
580 dst[j].i_packets += src[j].i_packets;
584 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
591 return single_open(file, rt_acct_proc_show, NULL);
594 static const struct file_operations rt_acct_proc_fops = {
595 .owner = THIS_MODULE,
596 .open = rt_acct_proc_open,
599 .release = single_release,
603 static int __net_init ip_rt_do_proc_init(struct net *net)
605 struct proc_dir_entry *pde;
607 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
612 pde = proc_create("rt_cache", S_IRUGO,
613 net->proc_net_stat, &rt_cpu_seq_fops);
617 #ifdef CONFIG_IP_ROUTE_CLASSID
618 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
624 #ifdef CONFIG_IP_ROUTE_CLASSID
626 remove_proc_entry("rt_cache", net->proc_net_stat);
629 remove_proc_entry("rt_cache", net->proc_net);
634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
636 remove_proc_entry("rt_cache", net->proc_net_stat);
637 remove_proc_entry("rt_cache", net->proc_net);
638 #ifdef CONFIG_IP_ROUTE_CLASSID
639 remove_proc_entry("rt_acct", net->proc_net);
643 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
644 .init = ip_rt_do_proc_init,
645 .exit = ip_rt_do_proc_exit,
648 static int __init ip_rt_proc_init(void)
650 return register_pernet_subsys(&ip_rt_proc_ops);
654 static inline int ip_rt_proc_init(void)
658 #endif /* CONFIG_PROC_FS */
660 static inline void rt_free(struct rtable *rt)
662 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
665 static inline void rt_drop(struct rtable *rt)
668 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
671 static inline int rt_fast_clean(struct rtable *rth)
673 /* Kill broadcast/multicast entries very aggressively, if they
674    collide in the hash table with more useful entries */
675 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
676 rt_is_input_route(rth) && rth->dst.rt_next;
679 static inline int rt_valuable(struct rtable *rth)
681 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
682 (rth->peer && rth->peer->pmtu_expires);
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
690 if (atomic_read(&rth->dst.__refcnt))
693 age = jiffies - rth->dst.lastuse;
694 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695 (age <= tmo2 && rt_valuable(rth)))
701 /* Bits of score are:
702 * 31: very valuable
703 * 30: not quite useless
704 * 29..0: usage counter
706 static inline u32 rt_score(struct rtable *rt)
708 u32 score = jiffies - rt->dst.lastuse;
710 score = ~score & ~(3<<30);
715 if (rt_is_output_route(rt) ||
716 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
722 static inline bool rt_caching(const struct net *net)
724 return net->ipv4.current_rt_cache_rebuild_count <=
725 net->ipv4.sysctl_rt_cache_rebuild_count;
728 static inline bool compare_hash_inputs(const struct rtable *rt1,
729 const struct rtable *rt2)
731 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
733 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740 (rt1->rt_mark ^ rt2->rt_mark) |
741 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
742 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
743 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
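/* Illustrative sketch (not part of the original file): compare_keys() above
 * ORs together per-field XORs so the whole match is straight-line code with
 * a single final branch. It is logically equivalent to this field-by-field
 * version, assuming the same struct rtable key fields.
 */
static inline int compare_keys_fieldwise(struct rtable *rt1, struct rtable *rt2)
{
	return rt1->rt_key_dst == rt2->rt_key_dst &&
	       rt1->rt_key_src == rt2->rt_key_src &&
	       rt1->rt_mark == rt2->rt_mark &&
	       rt1->rt_key_tos == rt2->rt_key_tos &&
	       rt1->rt_route_iif == rt2->rt_route_iif &&
	       rt1->rt_oif == rt2->rt_oif;
}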
746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
751 static inline int rt_is_expired(struct rtable *rth)
753 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
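/* Illustrative sketch (not part of the original file): the reader side of
 * the locking scheme described above. Readers take no spinlock; they walk
 * a bucket chain under rcu_read_lock_bh() and skip entries whose genid no
 * longer matches. `hash' is assumed to be a value produced by rt_hash().
 */
static inline struct rtable *rt_cache_lookup_sketch(unsigned int hash)
{
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (!rt_is_expired(rth)) {
			/* take a reference before leaving the RCU section */
			dst_use(&rth->dst, jiffies);
			break;
		}
	}
	rcu_read_unlock_bh();
	return rth;
}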
757 * Perform a full scan of the hash table and free all entries.
758 * Can be called by a softirq or a process.
759 * In the latter case, we want to reschedule if necessary.
761 static void rt_do_flush(struct net *net, int process_context)
764 struct rtable *rth, *next;
766 for (i = 0; i <= rt_hash_mask; i++) {
767 struct rtable __rcu **pprev;
770 if (process_context && need_resched())
772 rth = rcu_access_pointer(rt_hash_table[i].chain);
776 spin_lock_bh(rt_hash_lock_addr(i));
779 pprev = &rt_hash_table[i].chain;
780 rth = rcu_dereference_protected(*pprev,
781 lockdep_is_held(rt_hash_lock_addr(i)));
784 next = rcu_dereference_protected(rth->dst.rt_next,
785 lockdep_is_held(rt_hash_lock_addr(i)));
788 net_eq(dev_net(rth->dst.dev), net)) {
789 rcu_assign_pointer(*pprev, next);
790 rcu_assign_pointer(rth->dst.rt_next, list);
793 pprev = &rth->dst.rt_next;
798 spin_unlock_bh(rt_hash_lock_addr(i));
800 for (; list; list = next) {
801 next = rcu_dereference_protected(list->dst.rt_next, 1);
808 * While freeing expired entries, we compute the average chain length
809 * and its standard deviation, using fixed-point arithmetic.
810 * This gives an estimate for rt_chain_length_max:
811 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
812 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
815 #define FRACT_BITS 3
816 #define ONE (1UL << FRACT_BITS)
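/* Illustrative sketch (not part of the original file): how the fixed-point
 * sums gathered by rt_check_expire() below turn into the chain-length
 * bound. Chain lengths are accumulated scaled by ONE, so the final shift
 * by FRACT_BITS drops the 3 fractional bits again.
 */
static inline unsigned long
rt_chain_bound_sketch(unsigned long sum, unsigned long sum2,
		      unsigned long samples, int elasticity)
{
	unsigned long avg = sum / samples;	/* mean, scaled by ONE */
	unsigned long sd = int_sqrt(sum2 / samples - avg * avg); /* std dev, scaled */

	return max_t(unsigned long, elasticity, (avg + 4 * sd) >> FRACT_BITS);
}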
819 * Given a hash chain and an item in this hash chain,
820 * find if a previous entry has the same hash_inputs
821 * (but differs on tos, mark or oif)
822 * Returns 0 if an alias is found.
823 * Returns ONE if rth has no alias before itself.
825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
827 const struct rtable *aux = head;
830 if (compare_hash_inputs(aux, rth))
832 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
837 static void rt_check_expire(void)
839 static unsigned int rover;
840 unsigned int i = rover, goal;
842 struct rtable __rcu **rthp;
843 unsigned long samples = 0;
844 unsigned long sum = 0, sum2 = 0;
848 delta = jiffies - expires_ljiffies;
849 expires_ljiffies = jiffies;
850 mult = ((u64)delta) << rt_hash_log;
851 if (ip_rt_gc_timeout > 1)
852 do_div(mult, ip_rt_gc_timeout);
853 goal = (unsigned int)mult;
854 if (goal > rt_hash_mask)
855 goal = rt_hash_mask + 1;
856 for (; goal > 0; goal--) {
857 unsigned long tmo = ip_rt_gc_timeout;
858 unsigned long length;
860 i = (i + 1) & rt_hash_mask;
861 rthp = &rt_hash_table[i].chain;
868 if (rcu_dereference_raw(*rthp) == NULL)
871 spin_lock_bh(rt_hash_lock_addr(i));
872 while ((rth = rcu_dereference_protected(*rthp,
873 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874 prefetch(rth->dst.rt_next);
875 if (rt_is_expired(rth)) {
876 *rthp = rth->dst.rt_next;
880 if (rth->dst.expires) {
881 /* Entry is expired even if it is in use */
882 if (time_before_eq(jiffies, rth->dst.expires)) {
885 rthp = &rth->dst.rt_next;
887 * We only count entries on
888 * a chain with equal hash inputs once,
889 * so that entries for different QoS
890 * levels and other non-hash input
891 * attributes don't unfairly skew
892 * the length computation.
894 length += has_noalias(rt_hash_table[i].chain, rth);
897 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
900 /* Cleanup aged off entries. */
901 *rthp = rth->dst.rt_next;
904 spin_unlock_bh(rt_hash_lock_addr(i));
906 sum2 += length*length;
909 unsigned long avg = sum / samples;
910 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911 rt_chain_length_max = max_t(unsigned long,
913 (avg + 4*sd) >> FRACT_BITS);
919 * rt_worker_func() is run in process context.
920 * We call rt_check_expire() to scan part of the hash table.
922 static void rt_worker_func(struct work_struct *work)
925 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
929 * Perturbation of rt_genid by a small quantity [1..256].
930 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
931 * many times (2^24) without giving a recent rt_genid.
932 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
934 static void rt_cache_invalidate(struct net *net)
936 unsigned char shuffle;
938 get_random_bytes(&shuffle, sizeof(shuffle));
939 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
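/* Illustrative note (not part of the original file): bumping rt_genid is an
 * O(1) invalidation of the whole cache; readers compare each entry's genid
 * against the current one (rt_is_expired() above) and treat mismatches as
 * dead, so stale entries merely linger until GC or a flush reaps them.
 */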
944 * delay < 0 : invalidate cache (fast : entries will be deleted later)
945 * delay >= 0 : invalidate & flush cache (can be long)
947 void rt_cache_flush(struct net *net, int delay)
949 rt_cache_invalidate(net);
951 rt_do_flush(net, !in_softirq());
954 /* Flush previously invalidated entries from the cache */
955 void rt_cache_flush_batch(struct net *net)
957 rt_do_flush(net, !in_softirq());
960 static void rt_emergency_hash_rebuild(struct net *net)
963 printk(KERN_WARNING "Route hash chain too long!\n");
964 rt_cache_invalidate(net);
968 Short description of GC goals.
970 We want to build an algorithm which keeps the routing cache
971 at an equilibrium point, where the number of aged-off entries
972 stays approximately equal to the number of newly generated ones.
974 The current expiration strength is the variable "expire".
975 We try to adjust it dynamically, so that when networking
976 is idle, "expire" is large enough to keep enough warm entries,
977 and when load increases, it shrinks to limit the cache size.
980 static int rt_garbage_collect(struct dst_ops *ops)
982 static unsigned long expire = RT_GC_TIMEOUT;
983 static unsigned long last_gc;
985 static int equilibrium;
987 struct rtable __rcu **rthp;
988 unsigned long now = jiffies;
990 int entries = dst_entries_get_fast(&ipv4_dst_ops);
993 * Garbage collection is pretty expensive,
994 * do not run it too frequently.
997 RT_CACHE_STAT_INC(gc_total);
999 if (now - last_gc < ip_rt_gc_min_interval &&
1000 entries < ip_rt_max_size) {
1001 RT_CACHE_STAT_INC(gc_ignored);
1005 entries = dst_entries_get_slow(&ipv4_dst_ops);
1006 /* Calculate the number of entries we want to expire now. */
1007 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1009 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 equilibrium = ipv4_dst_ops.gc_thresh;
1011 goal = entries - equilibrium;
1013 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014 goal = entries - equilibrium;
1017 /* We are in a dangerous area. Try to reduce the cache really aggressively. */
1020 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021 equilibrium = entries - goal;
1024 if (now - last_gc >= ip_rt_gc_min_interval)
1028 equilibrium += goal;
1035 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 unsigned long tmo = expire;
1038 k = (k + 1) & rt_hash_mask;
1039 rthp = &rt_hash_table[k].chain;
1040 spin_lock_bh(rt_hash_lock_addr(k));
1041 while ((rth = rcu_dereference_protected(*rthp,
1042 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043 if (!rt_is_expired(rth) &&
1044 !rt_may_expire(rth, tmo, expire)) {
1046 rthp = &rth->dst.rt_next;
1049 *rthp = rth->dst.rt_next;
1053 spin_unlock_bh(rt_hash_lock_addr(k));
1062 /* Goal is not achieved. We stop the process if:
1064 - expire has been reduced to zero; otherwise, expire is halved.
1065 - the table is not full.
1066 - we are called from interrupt context.
1067 - the jiffies check is just a fallback/debug loop breaker;
1068 we will not spin here for a long time in any case.
1071 RT_CACHE_STAT_INC(gc_goal_miss);
1078 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080 } while (!in_softirq() && time_before_eq(jiffies, now));
1082 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1086 if (net_ratelimit())
1087 printk(KERN_WARNING "dst cache overflow\n");
1088 RT_CACHE_STAT_INC(gc_dst_overflow);
1092 expire += ip_rt_gc_min_interval;
1093 if (expire > ip_rt_gc_timeout ||
1094 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1096 expire = ip_rt_gc_timeout;
1101 * Returns the number of entries in a hash chain that have different hash_inputs
1103 static int slow_chain_length(const struct rtable *head)
1106 const struct rtable *rth = head;
1109 length += has_noalias(head, rth);
1110 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1112 return length >> FRACT_BITS;
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1117 static const __be32 inaddr_any = 0;
1118 struct net_device *dev = dst->dev;
1119 const __be32 *pkey = daddr;
1120 const struct rtable *rt;
1121 struct neighbour *n;
1123 rt = (const struct rtable *) dst;
1125 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1127 else if (rt->rt_gateway)
1128 pkey = (const __be32 *) &rt->rt_gateway;
1130 n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1133 return neigh_create(&arp_tbl, pkey, dev);
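/* Illustrative note (not part of the original file): the neighbour is keyed
 * on the gateway when the route has one, else on the destination address
 * itself; loopback and point-to-point devices use INADDR_ANY, since the
 * link peer is implicit. Only on a cache miss do we fall back to
 * neigh_create().
 */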
1136 static int rt_bind_neighbour(struct rtable *rt)
1138 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1141 dst_set_neighbour(&rt->dst, n);
1146 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1147 struct sk_buff *skb, int ifindex)
1149 struct rtable *rth, *cand;
1150 struct rtable __rcu **rthp, **candp;
1154 int attempts = !in_softirq();
1158 min_score = ~(u32)0;
1163 if (!rt_caching(dev_net(rt->dst.dev))) {
1165 * If we're not caching, just tell the caller we
1166 * were successful and don't touch the route. The
1167 * caller holds the sole reference to the cache entry, and
1168 * it will be released when the caller is done with it.
1169 * If we drop it here, the callers have no way to resolve routes
1170 * when we're not caching. Instead, just point *rp at rt, so
1171 * the caller gets a single use out of the route.
1172 * Note that we do rt_free on this new route entry, so that
1173 * once its refcount hits zero, we are still able to reap it.
1175 * Note: To avoid expensive RCU machinery for this uncached dst,
1176 * we set DST_NOCACHE so that dst_release() can free dst without
1177 * waiting for a grace period.
1180 rt->dst.flags |= DST_NOCACHE;
1181 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1182 int err = rt_bind_neighbour(rt);
1184 if (net_ratelimit())
1186 "Neighbour table failure & not caching routes.\n");
1188 return ERR_PTR(err);
1195 rthp = &rt_hash_table[hash].chain;
1197 spin_lock_bh(rt_hash_lock_addr(hash));
1198 while ((rth = rcu_dereference_protected(*rthp,
1199 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1200 if (rt_is_expired(rth)) {
1201 *rthp = rth->dst.rt_next;
1205 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1207 *rthp = rth->dst.rt_next;
1209 * Since lookup is lock-free, the deletion
1210 * must be visible to another weakly ordered CPU before
1211 * the insertion at the start of the hash chain.
1213 rcu_assign_pointer(rth->dst.rt_next,
1214 rt_hash_table[hash].chain);
1216 * Since lookup is lock-free, the update writes
1217 * must be ordered for consistency on SMP.
1219 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1221 dst_use(&rth->dst, now);
1222 spin_unlock_bh(rt_hash_lock_addr(hash));
1226 skb_dst_set(skb, &rth->dst);
1230 if (!atomic_read(&rth->dst.__refcnt)) {
1231 u32 score = rt_score(rth);
1233 if (score <= min_score) {
1242 rthp = &rth->dst.rt_next;
1246 /* ip_rt_gc_elasticity used to be the average chain
1247 * length, above which GC becomes really aggressive.
1249 * The second limit is less certain. At the moment it allows
1250 * only 2 entries per bucket. We will see.
1252 if (chain_length > ip_rt_gc_elasticity) {
1253 *candp = cand->dst.rt_next;
1257 if (chain_length > rt_chain_length_max &&
1258 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1259 struct net *net = dev_net(rt->dst.dev);
1260 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1261 if (!rt_caching(net)) {
1262 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1263 rt->dst.dev->name, num);
1265 rt_emergency_hash_rebuild(net);
1266 spin_unlock_bh(rt_hash_lock_addr(hash));
1268 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1269 ifindex, rt_genid(net));
1274 /* Try to bind the route to an ARP neighbour only if it is an output
1275 route or on the unicast forwarding path.
1277 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1278 int err = rt_bind_neighbour(rt);
1280 spin_unlock_bh(rt_hash_lock_addr(hash));
1282 if (err != -ENOBUFS) {
1284 return ERR_PTR(err);
1287 /* Neighbour tables are full and nothing
1288 can be released. Try to shrink the route cache;
1289 most likely it holds some neighbour records.
1291 if (attempts-- > 0) {
1292 int saved_elasticity = ip_rt_gc_elasticity;
1293 int saved_int = ip_rt_gc_min_interval;
1294 ip_rt_gc_elasticity = 1;
1295 ip_rt_gc_min_interval = 0;
1296 rt_garbage_collect(&ipv4_dst_ops);
1297 ip_rt_gc_min_interval = saved_int;
1298 ip_rt_gc_elasticity = saved_elasticity;
1302 if (net_ratelimit())
1303 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1305 return ERR_PTR(-ENOBUFS);
1309 rt->dst.rt_next = rt_hash_table[hash].chain;
1312 * Since lookup is lock-free, we must make sure
1313 * previous writes to rt are committed to memory
1314 * before making rt visible to other CPUs.
1316 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1318 spin_unlock_bh(rt_hash_lock_addr(hash));
1322 skb_dst_set(skb, &rt->dst);
1326 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1328 static u32 rt_peer_genid(void)
1330 return atomic_read(&__rt_peer_genid);
1333 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1335 struct inet_peer *peer;
1337 peer = inet_getpeer_v4(daddr, create);
1339 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1342 rt->rt_peer_genid = rt_peer_genid();
1346 * Peer allocation may fail only in serious out-of-memory conditions. However,
1347 * we can still generate some output.
1348 * Random ID selection looks a bit dangerous because we have no chance to
1349 * select an ID that is unique over a reasonable period of time.
1350 * But a broken packet identifier may be better than no packet at all.
1352 static void ip_select_fb_ident(struct iphdr *iph)
1354 static DEFINE_SPINLOCK(ip_fb_id_lock);
1355 static u32 ip_fallback_id;
1358 spin_lock_bh(&ip_fb_id_lock);
1359 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1360 iph->id = htons(salt & 0xFFFF);
1361 ip_fallback_id = salt;
1362 spin_unlock_bh(&ip_fb_id_lock);
1365 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1367 struct rtable *rt = (struct rtable *) dst;
1369 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1370 if (rt->peer == NULL)
1371 rt_bind_peer(rt, rt->rt_dst, 1);
1373 /* If peer is attached to destination, it is never detached,
1374 so we need not grab a lock to dereference it.
1377 iph->id = htons(inet_getid(rt->peer, more));
1381 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1382 __builtin_return_address(0));
1384 ip_select_fb_ident(iph);
1386 EXPORT_SYMBOL(__ip_select_ident);
1388 static void rt_del(unsigned hash, struct rtable *rt)
1390 struct rtable __rcu **rthp;
1393 rthp = &rt_hash_table[hash].chain;
1394 spin_lock_bh(rt_hash_lock_addr(hash));
1396 while ((aux = rcu_dereference_protected(*rthp,
1397 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1398 if (aux == rt || rt_is_expired(aux)) {
1399 *rthp = aux->dst.rt_next;
1403 rthp = &aux->dst.rt_next;
1405 spin_unlock_bh(rt_hash_lock_addr(hash));
1408 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1410 struct rtable *rt = (struct rtable *) dst;
1411 __be32 orig_gw = rt->rt_gateway;
1412 struct neighbour *n, *old_n;
1414 dst_confirm(&rt->dst);
1416 rt->rt_gateway = peer->redirect_learned.a4;
1418 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1420 rt->rt_gateway = orig_gw;
1423 old_n = xchg(&rt->dst._neighbour, n);
1425 neigh_release(old_n);
1426 if (!(n->nud_state & NUD_VALID)) {
1427 neigh_event_send(n, NULL);
1429 rt->rt_flags |= RTCF_REDIRECTED;
1430 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1434 /* called in rcu_read_lock() section */
1435 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1436 __be32 saddr, struct net_device *dev)
1439 struct in_device *in_dev = __in_dev_get_rcu(dev);
1440 __be32 skeys[2] = { saddr, 0 };
1441 int ikeys[2] = { dev->ifindex, 0 };
1442 struct inet_peer *peer;
1449 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1450 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1451 ipv4_is_zeronet(new_gw))
1452 goto reject_redirect;
1454 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1455 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1456 goto reject_redirect;
1457 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1458 goto reject_redirect;
1460 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1461 goto reject_redirect;
1464 for (s = 0; s < 2; s++) {
1465 for (i = 0; i < 2; i++) {
1467 struct rtable __rcu **rthp;
1470 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1472 rthp = &rt_hash_table[hash].chain;
1474 while ((rt = rcu_dereference(*rthp)) != NULL) {
1475 rthp = &rt->dst.rt_next;
1477 if (rt->rt_key_dst != daddr ||
1478 rt->rt_key_src != skeys[s] ||
1479 rt->rt_oif != ikeys[i] ||
1480 rt_is_input_route(rt) ||
1481 rt_is_expired(rt) ||
1482 !net_eq(dev_net(rt->dst.dev), net) ||
1484 rt->dst.dev != dev ||
1485 rt->rt_gateway != old_gw)
1489 rt_bind_peer(rt, rt->rt_dst, 1);
1493 if (peer->redirect_learned.a4 != new_gw ||
1494 peer->redirect_genid != redirect_genid) {
1495 peer->redirect_learned.a4 = new_gw;
1496 peer->redirect_genid = redirect_genid;
1497 atomic_inc(&__rt_peer_genid);
1499 check_peer_redir(&rt->dst, peer);
1507 #ifdef CONFIG_IP_ROUTE_VERBOSE
1508 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1509 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1510 " Advised path = %pI4 -> %pI4\n",
1511 &old_gw, dev->name, &new_gw,
1517 static bool peer_pmtu_expired(struct inet_peer *peer)
1519 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522 time_after_eq(jiffies, orig) &&
1523 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1526 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1528 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1531 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1534 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1536 struct rtable *rt = (struct rtable *)dst;
1537 struct dst_entry *ret = dst;
1540 if (dst->obsolete > 0) {
1543 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1544 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1546 rt_genid(dev_net(dst->dev)));
1549 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1550 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1558 * 1. The first ip_rt_redirect_number redirects are sent
1559 * with exponential backoff, then we stop sending them at all,
1560 * assuming that the host ignores our redirects.
1561 * 2. If we did not see packets requiring redirects
1562 * during ip_rt_redirect_silence, we assume that the host
1563 * forgot the redirected route and start sending redirects again.
1565 * This algorithm is much cheaper and more intelligent than the dumb
1566 * load limiting in icmp.c.
1568 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1569 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1572 void ip_rt_send_redirect(struct sk_buff *skb)
1574 struct rtable *rt = skb_rtable(skb);
1575 struct in_device *in_dev;
1576 struct inet_peer *peer;
1580 in_dev = __in_dev_get_rcu(rt->dst.dev);
1581 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1585 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1589 rt_bind_peer(rt, rt->rt_dst, 1);
1592 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1596 /* No redirected packets during ip_rt_redirect_silence;
1597 * reset the algorithm.
1599 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1600 peer->rate_tokens = 0;
1602 /* Too many ignored redirects; do not send anything;
1603 * set peer->rate_last to the time of the last seen redirected packet.
1605 if (peer->rate_tokens >= ip_rt_redirect_number) {
1606 peer->rate_last = jiffies;
1610 /* Check for load limit; set rate_last to the latest sent
1611 * redirect. */
1613 if (peer->rate_tokens == 0 ||
1616 (ip_rt_redirect_load << peer->rate_tokens)))) {
1617 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1618 peer->rate_last = jiffies;
1619 ++peer->rate_tokens;
1620 #ifdef CONFIG_IP_ROUTE_VERBOSE
1622 peer->rate_tokens == ip_rt_redirect_number &&
1624 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1625 &ip_hdr(skb)->saddr, rt->rt_iif,
1626 &rt->rt_dst, &rt->rt_gateway);
1631 static int ip_error(struct sk_buff *skb)
1633 struct rtable *rt = skb_rtable(skb);
1634 struct inet_peer *peer;
1639 switch (rt->dst.error) {
1644 code = ICMP_HOST_UNREACH;
1647 code = ICMP_NET_UNREACH;
1648 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1649 IPSTATS_MIB_INNOROUTES);
1652 code = ICMP_PKT_FILTERED;
1657 rt_bind_peer(rt, rt->rt_dst, 1);
1663 peer->rate_tokens += now - peer->rate_last;
1664 if (peer->rate_tokens > ip_rt_error_burst)
1665 peer->rate_tokens = ip_rt_error_burst;
1666 peer->rate_last = now;
1667 if (peer->rate_tokens >= ip_rt_error_cost)
1668 peer->rate_tokens -= ip_rt_error_cost;
1673 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1675 out: kfree_skb(skb);
1680 * The last two values are not from the RFC but
1681 * are needed for AMPRnet AX.25 paths.
1684 static const unsigned short mtu_plateau[] =
1685 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1687 static inline unsigned short guess_mtu(unsigned short old_mtu)
1691 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1692 if (old_mtu > mtu_plateau[i])
1693 return mtu_plateau[i];
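/* Illustrative note (not part of the original file): guess_mtu() walks the
 * RFC 1191 plateau table downwards, e.g. old_mtu 1500 yields 1492 and 576
 * yields 296; values at or below the last plateau fall back to the IPv4
 * minimum of 68 in the elided tail of the function.
 */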
1697 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1698 unsigned short new_mtu,
1699 struct net_device *dev)
1701 unsigned short old_mtu = ntohs(iph->tot_len);
1702 unsigned short est_mtu = 0;
1703 struct inet_peer *peer;
1705 peer = inet_getpeer_v4(iph->daddr, 1);
1707 unsigned short mtu = new_mtu;
1709 if (new_mtu < 68 || new_mtu >= old_mtu) {
1710 /* BSD 4.2 derived systems incorrectly adjust
1711 * tot_len by the IP header length, and report
1712 * a zero MTU in the ICMP message.
1715 old_mtu >= 68 + (iph->ihl << 2))
1716 old_mtu -= iph->ihl << 2;
1717 mtu = guess_mtu(old_mtu);
1720 if (mtu < ip_rt_min_pmtu)
1721 mtu = ip_rt_min_pmtu;
1722 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1723 unsigned long pmtu_expires;
1725 pmtu_expires = jiffies + ip_rt_mtu_expires;
1730 peer->pmtu_learned = mtu;
1731 peer->pmtu_expires = pmtu_expires;
1732 atomic_inc(&__rt_peer_genid);
1737 return est_mtu ? : new_mtu;
1740 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1742 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1746 if (time_before(jiffies, expires)) {
1747 u32 orig_dst_mtu = dst_mtu(dst);
1748 if (peer->pmtu_learned < orig_dst_mtu) {
1749 if (!peer->pmtu_orig)
1750 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1751 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1753 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1754 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1757 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1759 struct rtable *rt = (struct rtable *) dst;
1760 struct inet_peer *peer;
1765 rt_bind_peer(rt, rt->rt_dst, 1);
1768 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1770 if (mtu < ip_rt_min_pmtu)
1771 mtu = ip_rt_min_pmtu;
1772 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1774 pmtu_expires = jiffies + ip_rt_mtu_expires;
1778 peer->pmtu_learned = mtu;
1779 peer->pmtu_expires = pmtu_expires;
1781 atomic_inc(&__rt_peer_genid);
1782 rt->rt_peer_genid = rt_peer_genid();
1784 check_peer_pmtu(dst, peer);
1789 static void ipv4_validate_peer(struct rtable *rt)
1791 if (rt->rt_peer_genid != rt_peer_genid()) {
1792 struct inet_peer *peer;
1795 rt_bind_peer(rt, rt->rt_dst, 0);
1799 check_peer_pmtu(&rt->dst, peer);
1801 if (peer->redirect_genid != redirect_genid)
1802 peer->redirect_learned.a4 = 0;
1803 if (peer->redirect_learned.a4 &&
1804 peer->redirect_learned.a4 != rt->rt_gateway)
1805 check_peer_redir(&rt->dst, peer);
1808 rt->rt_peer_genid = rt_peer_genid();
1812 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1814 struct rtable *rt = (struct rtable *) dst;
1816 if (rt_is_expired(rt))
1818 ipv4_validate_peer(rt);
1822 static void ipv4_dst_destroy(struct dst_entry *dst)
1824 struct rtable *rt = (struct rtable *) dst;
1825 struct inet_peer *peer = rt->peer;
1828 fib_info_put(rt->fi);
1838 static void ipv4_link_failure(struct sk_buff *skb)
1842 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1844 rt = skb_rtable(skb);
1845 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1846 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1849 static int ip_rt_bug(struct sk_buff *skb)
1851 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1852 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1853 skb->dev ? skb->dev->name : "?");
1860 We do not cache the source address of the outgoing interface,
1861 because it is used only by IP RR, TS and SRR options,
1862 so it is out of the fast path.
1864 BTW remember: "addr" is allowed to be unaligned.
1868 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1872 if (rt_is_output_route(rt))
1873 src = ip_hdr(skb)->saddr;
1875 struct fib_result res;
1881 memset(&fl4, 0, sizeof(fl4));
1882 fl4.daddr = iph->daddr;
1883 fl4.saddr = iph->saddr;
1884 fl4.flowi4_tos = RT_TOS(iph->tos);
1885 fl4.flowi4_oif = rt->dst.dev->ifindex;
1886 fl4.flowi4_iif = skb->dev->ifindex;
1887 fl4.flowi4_mark = skb->mark;
1890 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1891 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1893 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1897 memcpy(addr, &src, 4);
1900 #ifdef CONFIG_IP_ROUTE_CLASSID
1901 static void set_class_tag(struct rtable *rt, u32 tag)
1903 if (!(rt->dst.tclassid & 0xFFFF))
1904 rt->dst.tclassid |= tag & 0xFFFF;
1905 if (!(rt->dst.tclassid & 0xFFFF0000))
1906 rt->dst.tclassid |= tag & 0xFFFF0000;
1910 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1912 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1915 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1917 if (advmss > 65535 - 40)
1918 advmss = 65535 - 40;
1923 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1925 const struct rtable *rt = (const struct rtable *) dst;
1926 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1928 if (mtu && rt_is_output_route(rt))
1931 mtu = dst->dev->mtu;
1933 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1935 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1939 if (mtu > IP_MAX_MTU)
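/* Illustrative note (not part of the original file): in the elided branch
 * above, a locked MTU metric on a gatewayed route (rt_gateway != rt_dst)
 * is clamped to 576, the classic IPv4 minimum datagram size, since PMTU
 * discovery cannot update a locked metric; the result is finally capped
 * at IP_MAX_MTU.
 */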
1945 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1946 struct fib_info *fi)
1948 struct inet_peer *peer;
1951 /* If a peer entry exists for this destination, we must hook
1952 * it up in order to get at cached metrics.
1954 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1957 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1959 rt->rt_peer_genid = rt_peer_genid();
1960 if (inet_metrics_new(peer))
1961 memcpy(peer->metrics, fi->fib_metrics,
1962 sizeof(u32) * RTAX_MAX);
1963 dst_init_metrics(&rt->dst, peer->metrics, false);
1965 check_peer_pmtu(&rt->dst, peer);
1966 if (peer->redirect_genid != redirect_genid)
1967 peer->redirect_learned.a4 = 0;
1968 if (peer->redirect_learned.a4 &&
1969 peer->redirect_learned.a4 != rt->rt_gateway) {
1970 rt->rt_gateway = peer->redirect_learned.a4;
1971 rt->rt_flags |= RTCF_REDIRECTED;
1974 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1976 atomic_inc(&fi->fib_clntref);
1978 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1982 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1983 const struct fib_result *res,
1984 struct fib_info *fi, u16 type, u32 itag)
1986 struct dst_entry *dst = &rt->dst;
1989 if (FIB_RES_GW(*res) &&
1990 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1991 rt->rt_gateway = FIB_RES_GW(*res);
1992 rt_init_metrics(rt, fl4, fi);
1993 #ifdef CONFIG_IP_ROUTE_CLASSID
1994 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1998 if (dst_mtu(dst) > IP_MAX_MTU)
1999 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
2000 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
2001 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
2003 #ifdef CONFIG_IP_ROUTE_CLASSID
2004 #ifdef CONFIG_IP_MULTIPLE_TABLES
2005 set_class_tag(rt, fib_rules_tclass(res));
2007 set_class_tag(rt, itag);
2011 static struct rtable *rt_dst_alloc(struct net_device *dev,
2012 bool nopolicy, bool noxfrm)
2014 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2016 (nopolicy ? DST_NOPOLICY : 0) |
2017 (noxfrm ? DST_NOXFRM : 0));
2020 /* called in rcu_read_lock() section */
2021 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2022 u8 tos, struct net_device *dev, int our)
2027 struct in_device *in_dev = __in_dev_get_rcu(dev);
2031 /* Primary sanity checks. */
2036 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2037 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2040 if (ipv4_is_zeronet(saddr)) {
2041 if (!ipv4_is_local_multicast(daddr))
2043 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2045 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2050 rth = rt_dst_alloc(init_net.loopback_dev,
2051 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2055 #ifdef CONFIG_IP_ROUTE_CLASSID
2056 rth->dst.tclassid = itag;
2058 rth->dst.output = ip_rt_bug;
2060 rth->rt_key_dst = daddr;
2061 rth->rt_key_src = saddr;
2062 rth->rt_genid = rt_genid(dev_net(dev));
2063 rth->rt_flags = RTCF_MULTICAST;
2064 rth->rt_type = RTN_MULTICAST;
2065 rth->rt_key_tos = tos;
2066 rth->rt_dst = daddr;
2067 rth->rt_src = saddr;
2068 rth->rt_route_iif = dev->ifindex;
2069 rth->rt_iif = dev->ifindex;
2071 rth->rt_mark = skb->mark;
2072 rth->rt_gateway = daddr;
2073 rth->rt_spec_dst= spec_dst;
2074 rth->rt_peer_genid = 0;
2078 rth->dst.input= ip_local_deliver;
2079 rth->rt_flags |= RTCF_LOCAL;
2082 #ifdef CONFIG_IP_MROUTE
2083 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2084 rth->dst.input = ip_mr_input;
2086 RT_CACHE_STAT_INC(in_slow_mc);
2088 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2089 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2090 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2101 static void ip_handle_martian_source(struct net_device *dev,
2102 struct in_device *in_dev,
2103 struct sk_buff *skb,
2107 RT_CACHE_STAT_INC(in_martian_src);
2108 #ifdef CONFIG_IP_ROUTE_VERBOSE
2109 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2111 * RFC 1812 recommendation: if the source is martian,
2112 * the only hint we can give is the MAC header.
2114 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2115 &daddr, &saddr, dev->name);
2116 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2118 const unsigned char *p = skb_mac_header(skb);
2119 printk(KERN_WARNING "ll header: ");
2120 for (i = 0; i < dev->hard_header_len; i++, p++) {
2122 if (i < (dev->hard_header_len - 1))
2131 /* called in rcu_read_lock() section */
2132 static int __mkroute_input(struct sk_buff *skb,
2133 const struct fib_result *res,
2134 struct in_device *in_dev,
2135 __be32 daddr, __be32 saddr, u32 tos,
2136 struct rtable **result)
2140 struct in_device *out_dev;
2141 unsigned int flags = 0;
2145 /* get a working reference to the output device */
2146 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2147 if (out_dev == NULL) {
2148 if (net_ratelimit())
2149 printk(KERN_CRIT "Bug in ip_route_input" \
2150 "_slow(). Please, report\n");
2155 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2156 in_dev->dev, &spec_dst, &itag);
2158 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2165 flags |= RTCF_DIRECTSRC;
2167 if (out_dev == in_dev && err &&
2168 (IN_DEV_SHARED_MEDIA(out_dev) ||
2169 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2170 flags |= RTCF_DOREDIRECT;
2172 if (skb->protocol != htons(ETH_P_IP)) {
2173 /* Not IP (i.e. ARP). Do not create a route if it is
2174 * invalid for proxy arp. DNAT routes are always valid.
2176 * The proxy arp feature has been extended to allow ARP
2177 * replies back to the same interface, to support
2178 * Private VLAN switch technologies. See arp.c.
2180 if (out_dev == in_dev &&
2181 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2187 rth = rt_dst_alloc(out_dev->dev,
2188 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2189 IN_DEV_CONF_GET(out_dev, NOXFRM));
2195 rth->rt_key_dst = daddr;
2196 rth->rt_key_src = saddr;
2197 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2198 rth->rt_flags = flags;
2199 rth->rt_type = res->type;
2200 rth->rt_key_tos = tos;
2201 rth->rt_dst = daddr;
2202 rth->rt_src = saddr;
2203 rth->rt_route_iif = in_dev->dev->ifindex;
2204 rth->rt_iif = in_dev->dev->ifindex;
2206 rth->rt_mark = skb->mark;
2207 rth->rt_gateway = daddr;
2208 rth->rt_spec_dst= spec_dst;
2209 rth->rt_peer_genid = 0;
2213 rth->dst.input = ip_forward;
2214 rth->dst.output = ip_output;
2216 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2224 static int ip_mkroute_input(struct sk_buff *skb,
2225 struct fib_result *res,
2226 const struct flowi4 *fl4,
2227 struct in_device *in_dev,
2228 __be32 daddr, __be32 saddr, u32 tos)
2230 struct rtable* rth = NULL;
2234 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2235 if (res->fi && res->fi->fib_nhs > 1)
2236 fib_select_multipath(res);
2239 /* create a routing cache entry */
2240 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2244 /* put it into the cache */
2245 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2246 rt_genid(dev_net(rth->dst.dev)));
2247 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2249 return PTR_ERR(rth);
2254 * NOTE. We drop all packets that have a local source
2255 * address, because every properly looped-back packet
2256 * must have the correct destination already attached by the output routine.
2258 * Such an approach solves two big problems:
2259 * 1. Non-simplex devices are handled properly.
2260 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2261 * Called with rcu_read_lock().
2264 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2265 u8 tos, struct net_device *dev)
2267 struct fib_result res;
2268 struct in_device *in_dev = __in_dev_get_rcu(dev);
2272 struct rtable * rth;
2276 struct net * net = dev_net(dev);
2278 /* IP on this device is disabled. */
2283 /* Check for the weirdest martians, which cannot be detected
2284 by fib_lookup. */
2287 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2288 ipv4_is_loopback(saddr))
2289 goto martian_source;
2291 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2294 /* Accept zero addresses only to limited broadcast;
2295 * I do not even know whether to fix it or not. Waiting for complaints :-)
2297 if (ipv4_is_zeronet(saddr))
2298 goto martian_source;
2300 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2301 goto martian_destination;
2304 * Now we are ready to route the packet.
2307 fl4.flowi4_iif = dev->ifindex;
2308 fl4.flowi4_mark = skb->mark;
2309 fl4.flowi4_tos = tos;
2310 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2313 err = fib_lookup(net, &fl4, &res);
2315 if (!IN_DEV_FORWARD(in_dev))
2320 RT_CACHE_STAT_INC(in_slow_tot);
2322 if (res.type == RTN_BROADCAST)
2325 if (res.type == RTN_LOCAL) {
2326 err = fib_validate_source(skb, saddr, daddr, tos,
2327 net->loopback_dev->ifindex,
2328 dev, &spec_dst, &itag);
2330 goto martian_source_keep_err;
2332 flags |= RTCF_DIRECTSRC;
2337 if (!IN_DEV_FORWARD(in_dev))
2339 if (res.type != RTN_UNICAST)
2340 goto martian_destination;
2342 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2346 if (skb->protocol != htons(ETH_P_IP))
2349 if (ipv4_is_zeronet(saddr))
2350 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2352 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2355 goto martian_source_keep_err;
2357 flags |= RTCF_DIRECTSRC;
2359 flags |= RTCF_BROADCAST;
2360 res.type = RTN_BROADCAST;
2361 RT_CACHE_STAT_INC(in_brd);
2364 rth = rt_dst_alloc(net->loopback_dev,
2365 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2369 rth->dst.input= ip_local_deliver;
2370 rth->dst.output= ip_rt_bug;
2371 #ifdef CONFIG_IP_ROUTE_CLASSID
2372 rth->dst.tclassid = itag;
2375 rth->rt_key_dst = daddr;
2376 rth->rt_key_src = saddr;
2377 rth->rt_genid = rt_genid(net);
2378 rth->rt_flags = flags|RTCF_LOCAL;
2379 rth->rt_type = res.type;
2380 rth->rt_key_tos = tos;
2381 rth->rt_dst = daddr;
2382 rth->rt_src = saddr;
2383 #ifdef CONFIG_IP_ROUTE_CLASSID
2384 rth->dst.tclassid = itag;
2386 rth->rt_route_iif = dev->ifindex;
2387 rth->rt_iif = dev->ifindex;
2389 rth->rt_mark = skb->mark;
2390 rth->rt_gateway = daddr;
2391 rth->rt_spec_dst= spec_dst;
2392 rth->rt_peer_genid = 0;
2395 if (res.type == RTN_UNREACHABLE) {
2396 rth->dst.input= ip_error;
2397 rth->dst.error= -err;
2398 rth->rt_flags &= ~RTCF_LOCAL;
2400 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2401 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2408 RT_CACHE_STAT_INC(in_no_route);
2409 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2410 res.type = RTN_UNREACHABLE;
2416 * Do not cache martian addresses: they should be logged (RFC1812)
2418 martian_destination:
2419 RT_CACHE_STAT_INC(in_martian_dst);
2420 #ifdef CONFIG_IP_ROUTE_VERBOSE
2421 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2422 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2423 &daddr, &saddr, dev->name);
2427 err = -EHOSTUNREACH;
2440 martian_source_keep_err:
2441 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2445 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2446 u8 tos, struct net_device *dev, bool noref)
2448 struct rtable * rth;
2450 int iif = dev->ifindex;
2458 if (!rt_caching(net))
2461 tos &= IPTOS_RT_MASK;
2462 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2464 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2465 rth = rcu_dereference(rth->dst.rt_next)) {
2466 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2467 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2468 (rth->rt_route_iif ^ iif) |
2469 (rth->rt_key_tos ^ tos)) == 0 &&
2470 rth->rt_mark == skb->mark &&
2471 net_eq(dev_net(rth->dst.dev), net) &&
2472 !rt_is_expired(rth)) {
2473 ipv4_validate_peer(rth);
2475 dst_use_noref(&rth->dst, jiffies);
2476 skb_dst_set_noref(skb, &rth->dst);
2478 dst_use(&rth->dst, jiffies);
2479 skb_dst_set(skb, &rth->dst);
2481 RT_CACHE_STAT_INC(in_hit);
2485 RT_CACHE_STAT_INC(in_hlist_search);
2489 /* Multicast recognition logic is moved from the route cache to here.
2490 The problem was that too many Ethernet cards have broken/missing
2491 hardware multicast filters :-( As a result, a host on a multicast
2492 network acquires a lot of useless route cache entries, sort of
2493 SDR messages from all over the world. Now we try to get rid of them.
2494 Really, provided the software IP multicast filter is organized
2495 reasonably (at least, hashed), it does not result in a slowdown
2496 compared with route cache reject entries.
2497 Note that multicast routers are not affected, because a
2498 route cache entry is created eventually.
2500 if (ipv4_is_multicast(daddr)) {
2501 struct in_device *in_dev = __in_dev_get_rcu(dev);
2504 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2505 ip_hdr(skb)->protocol);
2507 #ifdef CONFIG_IP_MROUTE
2509 (!ipv4_is_local_multicast(daddr) &&
2510 IN_DEV_MFORWARD(in_dev))
2513 int res = ip_route_input_mc(skb, daddr, saddr,
2522 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2526 EXPORT_SYMBOL(ip_route_input_common);
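/*
 * A minimal sketch of a typical caller (hypothetical, illustration
 * only; the real receive path is ip_rcv_finish()): resolve the input
 * route once per packet and let the attached dst dispatch it.
 */
#if 0
static int example_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	if (!skb_dst(skb)) {
		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, skb->dev);
		if (unlikely(err))
			return err;	/* e.g. martian source/destination */
	}
	return dst_input(skb);		/* ip_local_deliver or ip_forward */
}
#endif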
2528 /* called with rcu_read_lock() */
2529 static struct rtable *__mkroute_output(const struct fib_result *res,
2530 const struct flowi4 *fl4,
2531 __be32 orig_daddr, __be32 orig_saddr,
2532 int orig_oif, __u8 orig_rtos,
2533 struct net_device *dev_out,
2536 struct fib_info *fi = res->fi;
2537 struct in_device *in_dev;
2538 u16 type = res->type;
2541 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2542 return ERR_PTR(-EINVAL);
2544 if (ipv4_is_lbcast(fl4->daddr))
2545 type = RTN_BROADCAST;
2546 else if (ipv4_is_multicast(fl4->daddr))
2547 type = RTN_MULTICAST;
2548 else if (ipv4_is_zeronet(fl4->daddr))
2549 return ERR_PTR(-EINVAL);
2551 if (dev_out->flags & IFF_LOOPBACK)
2552 flags |= RTCF_LOCAL;
2554 in_dev = __in_dev_get_rcu(dev_out);
2556 return ERR_PTR(-EINVAL);
2558 if (type == RTN_BROADCAST) {
2559 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2561 } else if (type == RTN_MULTICAST) {
2562 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2563 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2565 flags &= ~RTCF_LOCAL;
2566 /* If a multicast route does not exist, use
2567 * the default one, but do not use a gateway in this case.
2568 * Yes, it is a hack.
2569 */
2570 if (fi && res->prefixlen < 4)
2571 fi = NULL;
2574 rth = rt_dst_alloc(dev_out,
2575 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2576 IN_DEV_CONF_GET(in_dev, NOXFRM));
2578 return ERR_PTR(-ENOBUFS);
2580 rth->dst.output = ip_output;
2582 rth->rt_key_dst = orig_daddr;
2583 rth->rt_key_src = orig_saddr;
2584 rth->rt_genid = rt_genid(dev_net(dev_out));
2585 rth->rt_flags = flags;
2586 rth->rt_type = type;
2587 rth->rt_key_tos = orig_rtos;
2588 rth->rt_dst = fl4->daddr;
2589 rth->rt_src = fl4->saddr;
2590 rth->rt_route_iif = 0;
2591 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2592 rth->rt_oif = orig_oif;
2593 rth->rt_mark = fl4->flowi4_mark;
2594 rth->rt_gateway = fl4->daddr;
2595 rth->rt_spec_dst = fl4->saddr;
2596 rth->rt_peer_genid = 0;
2600 RT_CACHE_STAT_INC(out_slow_tot);
2602 if (flags & RTCF_LOCAL) {
2603 rth->dst.input = ip_local_deliver;
2604 rth->rt_spec_dst = fl4->daddr;
2606 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2607 rth->rt_spec_dst = fl4->saddr;
2608 if (flags & RTCF_LOCAL &&
2609 !(dev_out->flags & IFF_LOOPBACK)) {
2610 rth->dst.output = ip_mc_output;
2611 RT_CACHE_STAT_INC(out_slow_mc);
2613 #ifdef CONFIG_IP_MROUTE
2614 if (type == RTN_MULTICAST) {
2615 if (IN_DEV_MFORWARD(in_dev) &&
2616 !ipv4_is_local_multicast(fl4->daddr)) {
2617 rth->dst.input = ip_mr_input;
2618 rth->dst.output = ip_mc_output;
2624 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2630 * Major route resolver routine.
2631 * Called with rcu_read_lock().
2634 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2636 struct net_device *dev_out = NULL;
2637 __u8 tos = RT_FL_TOS(fl4);
2638 unsigned int flags = 0;
2639 struct fib_result res;
2646 #ifdef CONFIG_IP_MULTIPLE_TABLES
2650 orig_daddr = fl4->daddr;
2651 orig_saddr = fl4->saddr;
2652 orig_oif = fl4->flowi4_oif;
2654 fl4->flowi4_iif = net->loopback_dev->ifindex;
2655 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2656 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2657 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2661 rth = ERR_PTR(-EINVAL);
2662 if (ipv4_is_multicast(fl4->saddr) ||
2663 ipv4_is_lbcast(fl4->saddr) ||
2664 ipv4_is_zeronet(fl4->saddr))
2667 /* I removed the check for oif == dev_out->oif here.
2668 It was wrong for two reasons:
2669 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2670 is assigned to multiple interfaces.
2671 2. Moreover, we are allowed to send packets with a saddr
2672 belonging to another iface. --ANK
2673 */
2675 if (fl4->flowi4_oif == 0 &&
2676 (ipv4_is_multicast(fl4->daddr) ||
2677 ipv4_is_lbcast(fl4->daddr))) {
2678 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2679 dev_out = __ip_dev_find(net, fl4->saddr, false);
2680 if (dev_out == NULL)
2683 /* Special hack: the user can direct multicasts
2684 and limited broadcasts via the necessary interface
2685 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2686 This hack is not just for fun, it allows
2687 vic, vat and friends to work.
2688 They bind a socket to loopback, set the ttl to zero
2689 and expect that it will work.
2690 From the viewpoint of the routing cache they are broken,
2691 because we are not allowed to build a multicast path
2692 with a loopback source addr (look, the routing cache
2693 cannot know that the ttl is zero, so the packet
2694 will not leave this host and the route is valid).
2695 Luckily, this hack is a good workaround.
2696 */
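		/*
		 * A user-space sketch of the behaviour described above
		 * (hypothetical example, not kernel code): with no oif
		 * given, the sender pins the egress interface for a
		 * multicast destination purely by the source address it
		 * binds to, just as vic and vat do.
		 */
#if 0
		int fd = socket(AF_INET, SOCK_DGRAM, 0);
		struct sockaddr_in src = {
			.sin_family = AF_INET,
			.sin_addr.s_addr = inet_addr("192.0.2.1"), /* eth0 addr */
		};

		bind(fd, (struct sockaddr *)&src, sizeof(src));
		/* sendto() to a 224/4 destination now leaves via the
		 * device that owns 192.0.2.1, no IP_MULTICAST_IF needed */
#endif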
2698 fl4->flowi4_oif = dev_out->ifindex;
2702 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2703 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2704 if (!__ip_dev_find(net, fl4->saddr, false))
2710 if (fl4->flowi4_oif) {
2711 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2712 rth = ERR_PTR(-ENODEV);
2713 if (dev_out == NULL)
2716 /* RACE: Check return value of inet_select_addr instead. */
2717 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2718 rth = ERR_PTR(-ENETUNREACH);
2721 if (ipv4_is_local_multicast(fl4->daddr) ||
2722 ipv4_is_lbcast(fl4->daddr)) {
2724 fl4->saddr = inet_select_addr(dev_out, 0,
2729 if (ipv4_is_multicast(fl4->daddr))
2730 fl4->saddr = inet_select_addr(dev_out, 0,
2732 else if (!fl4->daddr)
2733 fl4->saddr = inet_select_addr(dev_out, 0,
2739 fl4->daddr = fl4->saddr;
2741 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2742 dev_out = net->loopback_dev;
2743 fl4->flowi4_oif = net->loopback_dev->ifindex;
2744 res.type = RTN_LOCAL;
2745 flags |= RTCF_LOCAL;
2749 if (fib_lookup(net, fl4, &res)) {
2751 if (fl4->flowi4_oif) {
2752 /* Apparently, the routing tables are wrong. Assume
2753 that the destination is on-link.
2756 Because we are allowed to send to an iface
2757 even if it has NO routes and NO assigned
2758 addresses. When oif is specified, the routing
2759 tables are looked up with only one purpose:
2760 to catch whether the destination is gatewayed rather than
2761 direct. Moreover, if MSG_DONTROUTE is set,
2762 we send the packet, ignoring both routing tables
2763 and ifaddr state. --ANK
2766 We could do the same even if oif is unknown
2767 (as IPv6 likely does), but we do not.
2768 */
2770 if (fl4->saddr == 0)
2771 fl4->saddr = inet_select_addr(dev_out, 0,
2773 res.type = RTN_UNICAST;
2776 rth = ERR_PTR(-ENETUNREACH);
2780 if (res.type == RTN_LOCAL) {
2781 if (!fl4->saddr) {
2782 if (res.fi->fib_prefsrc)
2783 fl4->saddr = res.fi->fib_prefsrc;
2784 else
2785 fl4->saddr = fl4->daddr;
2786 }
2787 dev_out = net->loopback_dev;
2788 fl4->flowi4_oif = dev_out->ifindex;
2790 flags |= RTCF_LOCAL;
2794 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2795 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2796 fib_select_multipath(&res);
2799 if (!res.prefixlen &&
2800 res.table->tb_num_default > 1 &&
2801 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2802 fib_select_default(&res);
2805 fl4->saddr = FIB_RES_PREFSRC(net, res);
2807 dev_out = FIB_RES_DEV(res);
2808 fl4->flowi4_oif = dev_out->ifindex;
2812 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2813 tos, dev_out, flags);
2817 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2818 rt_genid(dev_net(dev_out)));
2819 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2827 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2832 if (!rt_caching(net))
2835 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2838 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2839 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2840 if (rth->rt_key_dst == flp4->daddr &&
2841 rth->rt_key_src == flp4->saddr &&
2842 rt_is_output_route(rth) &&
2843 rth->rt_oif == flp4->flowi4_oif &&
2844 rth->rt_mark == flp4->flowi4_mark &&
2845 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2846 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2847 net_eq(dev_net(rth->dst.dev), net) &&
2848 !rt_is_expired(rth)) {
2849 ipv4_validate_peer(rth);
2850 dst_use(&rth->dst, jiffies);
2851 RT_CACHE_STAT_INC(out_hit);
2852 rcu_read_unlock_bh();
2854 flp4->saddr = rth->rt_src;
2856 flp4->daddr = rth->rt_dst;
2859 RT_CACHE_STAT_INC(out_hlist_search);
2861 rcu_read_unlock_bh();
2864 return ip_route_output_slow(net, flp4);
2866 EXPORT_SYMBOL_GPL(__ip_route_output_key);
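/*
 * A minimal sketch of a typical in-kernel caller (hypothetical,
 * illustration only): fill a flowi4 key and resolve it, hitting the
 * cached lookup above or falling back to ip_route_output_slow().
 */
#if 0
static int example_output_route(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
		.flowi4_tos = RT_TOS(0),
	};
	struct rtable *rt = ip_route_output_key(net, &fl4);

	if (IS_ERR(rt))
		return PTR_ERR(rt);	/* e.g. -ENETUNREACH */
	ip_rt_put(rt);			/* drop the reference when done */
	return 0;
}
#endif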
2868 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2873 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2875 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
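	/* a raw RTAX_MTU of zero means "not set": fall back to the device MTU */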
2877 return mtu ? : dst->dev->mtu;
2880 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2884 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2890 static struct dst_ops ipv4_dst_blackhole_ops = {
2892 .protocol = cpu_to_be16(ETH_P_IP),
2893 .destroy = ipv4_dst_destroy,
2894 .check = ipv4_blackhole_dst_check,
2895 .mtu = ipv4_blackhole_mtu,
2896 .default_advmss = ipv4_default_advmss,
2897 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2898 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2899 .neigh_lookup = ipv4_neigh_lookup,
2902 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2904 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2905 struct rtable *ort = (struct rtable *) dst_orig;
2908 struct dst_entry *new = &rt->dst;
2911 new->input = dst_discard;
2912 new->output = dst_discard;
2913 dst_copy_metrics(new, &ort->dst);
2915 new->dev = ort->dst.dev;
2919 rt->rt_key_dst = ort->rt_key_dst;
2920 rt->rt_key_src = ort->rt_key_src;
2921 rt->rt_key_tos = ort->rt_key_tos;
2922 rt->rt_route_iif = ort->rt_route_iif;
2923 rt->rt_iif = ort->rt_iif;
2924 rt->rt_oif = ort->rt_oif;
2925 rt->rt_mark = ort->rt_mark;
2927 rt->rt_genid = rt_genid(net);
2928 rt->rt_flags = ort->rt_flags;
2929 rt->rt_type = ort->rt_type;
2930 rt->rt_dst = ort->rt_dst;
2931 rt->rt_src = ort->rt_src;
2932 rt->rt_gateway = ort->rt_gateway;
2933 rt->rt_spec_dst = ort->rt_spec_dst;
2934 rt->peer = ort->peer;
2936 atomic_inc(&rt->peer->refcnt);
2939 atomic_inc(&rt->fi->fib_clntref);
2944 dst_release(dst_orig);
2946 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2949 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2952 struct rtable *rt = __ip_route_output_key(net, flp4);
2957 if (flp4->flowi4_proto)
2958 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2959 flowi4_to_flowi(flp4),
2964 EXPORT_SYMBOL_GPL(ip_route_output_flow);
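/*
 * Sketch (hypothetical, illustration only): a caller that sets
 * flowi4_proto opts in to IPsec policy resolution through the
 * xfrm_lookup() call above.
 */
#if 0
	struct flowi4 fl4 = {
		.daddr = daddr,
		.flowi4_proto = IPPROTO_TCP,
	};
	struct rtable *rt = ip_route_output_flow(net, &fl4, NULL);

	if (IS_ERR(rt))
		return PTR_ERR(rt);
#endif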
2966 static int rt_fill_info(struct net *net,
2967 struct sk_buff *skb, u32 pid, u32 seq, int event,
2968 int nowait, unsigned int flags)
2970 struct rtable *rt = skb_rtable(skb);
2972 struct nlmsghdr *nlh;
2973 unsigned long expires = 0;
2974 const struct inet_peer *peer = rt->peer;
2975 u32 id = 0, ts = 0, tsage = 0, error;
2977 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2981 r = nlmsg_data(nlh);
2982 r->rtm_family = AF_INET;
2983 r->rtm_dst_len = 32;
2985 r->rtm_tos = rt->rt_key_tos;
2986 r->rtm_table = RT_TABLE_MAIN;
2987 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2988 r->rtm_type = rt->rt_type;
2989 r->rtm_scope = RT_SCOPE_UNIVERSE;
2990 r->rtm_protocol = RTPROT_UNSPEC;
2991 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2992 if (rt->rt_flags & RTCF_NOTIFY)
2993 r->rtm_flags |= RTM_F_NOTIFY;
2995 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2997 if (rt->rt_key_src) {
2998 r->rtm_src_len = 32;
2999 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3002 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3003 #ifdef CONFIG_IP_ROUTE_CLASSID
3004 if (rt->dst.tclassid)
3005 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3007 if (rt_is_input_route(rt))
3008 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3009 else if (rt->rt_src != rt->rt_key_src)
3010 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3012 if (rt->rt_dst != rt->rt_gateway)
3013 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3015 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3016 goto nla_put_failure;
3019 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3021 error = rt->dst.error;
3023 inet_peer_refcheck(rt->peer);
3024 id = atomic_read(&peer->ip_id_count) & 0xffff;
3025 if (peer->tcp_ts_stamp) {
3027 tsage = get_seconds() - peer->tcp_ts_stamp;
3029 expires = ACCESS_ONCE(peer->pmtu_expires);
3031 if (time_before(jiffies, expires))
3038 if (rt_is_input_route(rt)) {
3039 #ifdef CONFIG_IP_MROUTE
3040 __be32 dst = rt->rt_dst;
3042 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3043 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3044 int err = ipmr_get_route(net, skb,
3045 rt->rt_src, rt->rt_dst,
3051 goto nla_put_failure;
3053 if (err == -EMSGSIZE)
3054 goto nla_put_failure;
3060 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3063 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3064 expires, error) < 0)
3065 goto nla_put_failure;
3067 return nlmsg_end(skb, nlh);
3070 nlmsg_cancel(skb, nlh);
3074 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3076 struct net *net = sock_net(in_skb->sk);
3078 struct nlattr *tb[RTA_MAX+1];
3079 struct rtable *rt = NULL;
3085 struct sk_buff *skb;
3087 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3091 rtm = nlmsg_data(nlh);
3093 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3099 /* Reserve room for dummy headers; this skb can pass
3100 through a good chunk of the routing engine.
3101 */
3102 skb_reset_mac_header(skb);
3103 skb_reset_network_header(skb);
3105 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3106 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3107 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3109 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3110 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3111 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3112 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3115 struct net_device *dev;
3117 dev = __dev_get_by_index(net, iif);
3123 skb->protocol = htons(ETH_P_IP);
3127 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3130 rt = skb_rtable(skb);
3131 if (err == 0 && rt->dst.error)
3132 err = -rt->dst.error;
3134 struct flowi4 fl4 = {
3137 .flowi4_tos = rtm->rtm_tos,
3138 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3139 .flowi4_mark = mark,
3141 rt = ip_route_output_key(net, &fl4);
3151 skb_dst_set(skb, &rt->dst);
3152 if (rtm->rtm_flags & RTM_F_NOTIFY)
3153 rt->rt_flags |= RTCF_NOTIFY;
3155 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3156 RTM_NEWROUTE, 0, 0);
3160 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
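/*
 * User-space sketch (hypothetical, illustration only): the kind of
 * request that "ip route get 192.0.2.1" sends down to
 * inet_rtm_getroute() above; reply parsing and error handling are
 * elided.
 */
#if 0
static void example_getroute(int nl_fd)		/* NETLINK_ROUTE socket */
{
	struct {
		struct nlmsghdr	nlh;
		struct rtmsg	rtm;
		struct rtattr	rta;
		__be32		dst;
	} req = {
		.nlh.nlmsg_len	 = sizeof(req),
		.nlh.nlmsg_type	 = RTM_GETROUTE,
		.nlh.nlmsg_flags = NLM_F_REQUEST,
		.rtm.rtm_family	 = AF_INET,
		.rta.rta_type	 = RTA_DST,
		.rta.rta_len	 = RTA_LENGTH(sizeof(__be32)),
	};

	req.dst = inet_addr("192.0.2.1");
	send(nl_fd, &req, sizeof(req), 0);
	/* the RTM_NEWROUTE reply comes from rt_fill_info() above */
}
#endif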
3169 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3176 net = sock_net(skb->sk);
3181 s_idx = idx = cb->args[1];
3182 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3183 if (!rt_hash_table[h].chain)
3186 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3187 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3188 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3190 if (rt_is_expired(rt))
3192 skb_dst_set_noref(skb, &rt->dst);
3193 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3194 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3195 1, NLM_F_MULTI) <= 0) {
3197 rcu_read_unlock_bh();
3202 rcu_read_unlock_bh();
3211 void ip_rt_multicast_event(struct in_device *in_dev)
3213 rt_cache_flush(dev_net(in_dev->dev), 0);
3216 #ifdef CONFIG_SYSCTL
3217 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3218 void __user *buffer,
3219 size_t *lenp, loff_t *ppos)
3226 memcpy(&ctl, __ctl, sizeof(ctl));
3227 ctl.data = &flush_delay;
3228 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3230 net = (struct net *)__ctl->extra1;
3231 rt_cache_flush(net, flush_delay);
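/*
 * User-space sketch (hypothetical): writing an integer delay to
 * /proc/sys/net/ipv4/route/flush lands in the handler above, which
 * copies the value out and calls rt_cache_flush() for this netns.
 */
#if 0
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd >= 0) {
		write(fd, "0", 1);	/* 0 = flush immediately */
		close(fd);
	}
#endif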
3238 static ctl_table ipv4_route_table[] = {
3240 .procname = "gc_thresh",
3241 .data = &ipv4_dst_ops.gc_thresh,
3242 .maxlen = sizeof(int),
3244 .proc_handler = proc_dointvec,
3247 .procname = "max_size",
3248 .data = &ip_rt_max_size,
3249 .maxlen = sizeof(int),
3251 .proc_handler = proc_dointvec,
3254 /* Deprecated. Use gc_min_interval_ms */
3256 .procname = "gc_min_interval",
3257 .data = &ip_rt_gc_min_interval,
3258 .maxlen = sizeof(int),
3260 .proc_handler = proc_dointvec_jiffies,
3263 .procname = "gc_min_interval_ms",
3264 .data = &ip_rt_gc_min_interval,
3265 .maxlen = sizeof(int),
3267 .proc_handler = proc_dointvec_ms_jiffies,
3270 .procname = "gc_timeout",
3271 .data = &ip_rt_gc_timeout,
3272 .maxlen = sizeof(int),
3274 .proc_handler = proc_dointvec_jiffies,
3277 .procname = "gc_interval",
3278 .data = &ip_rt_gc_interval,
3279 .maxlen = sizeof(int),
3281 .proc_handler = proc_dointvec_jiffies,
3284 .procname = "redirect_load",
3285 .data = &ip_rt_redirect_load,
3286 .maxlen = sizeof(int),
3288 .proc_handler = proc_dointvec,
3291 .procname = "redirect_number",
3292 .data = &ip_rt_redirect_number,
3293 .maxlen = sizeof(int),
3295 .proc_handler = proc_dointvec,
3298 .procname = "redirect_silence",
3299 .data = &ip_rt_redirect_silence,
3300 .maxlen = sizeof(int),
3302 .proc_handler = proc_dointvec,
3305 .procname = "error_cost",
3306 .data = &ip_rt_error_cost,
3307 .maxlen = sizeof(int),
3309 .proc_handler = proc_dointvec,
3312 .procname = "error_burst",
3313 .data = &ip_rt_error_burst,
3314 .maxlen = sizeof(int),
3316 .proc_handler = proc_dointvec,
3319 .procname = "gc_elasticity",
3320 .data = &ip_rt_gc_elasticity,
3321 .maxlen = sizeof(int),
3323 .proc_handler = proc_dointvec,
3326 .procname = "mtu_expires",
3327 .data = &ip_rt_mtu_expires,
3328 .maxlen = sizeof(int),
3330 .proc_handler = proc_dointvec_jiffies,
3333 .procname = "min_pmtu",
3334 .data = &ip_rt_min_pmtu,
3335 .maxlen = sizeof(int),
3337 .proc_handler = proc_dointvec,
3340 .procname = "min_adv_mss",
3341 .data = &ip_rt_min_advmss,
3342 .maxlen = sizeof(int),
3344 .proc_handler = proc_dointvec,
3349 static struct ctl_table empty[1];
3351 static struct ctl_table ipv4_skeleton[] =
3353 { .procname = "route",
3354 .mode = 0555, .child = ipv4_route_table},
3355 { .procname = "neigh",
3356 .mode = 0555, .child = empty},
3360 static __net_initdata struct ctl_path ipv4_path[] = {
3361 { .procname = "net", },
3362 { .procname = "ipv4", },
3366 static struct ctl_table ipv4_route_flush_table[] = {
3368 .procname = "flush",
3369 .maxlen = sizeof(int),
3371 .proc_handler = ipv4_sysctl_rtcache_flush,
3376 static __net_initdata struct ctl_path ipv4_route_path[] = {
3377 { .procname = "net", },
3378 { .procname = "ipv4", },
3379 { .procname = "route", },
3383 static __net_init int sysctl_route_net_init(struct net *net)
3385 struct ctl_table *tbl;
3387 tbl = ipv4_route_flush_table;
3388 if (!net_eq(net, &init_net)) {
3389 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3393 tbl[0].extra1 = net;
3395 net->ipv4.route_hdr =
3396 register_net_sysctl_table(net, ipv4_route_path, tbl);
3397 if (net->ipv4.route_hdr == NULL)
3402 if (tbl != ipv4_route_flush_table)
3408 static __net_exit void sysctl_route_net_exit(struct net *net)
3410 struct ctl_table *tbl;
3412 tbl = net->ipv4.route_hdr->ctl_table_arg;
3413 unregister_net_sysctl_table(net->ipv4.route_hdr);
3414 BUG_ON(tbl == ipv4_route_flush_table);
3418 static __net_initdata struct pernet_operations sysctl_route_ops = {
3419 .init = sysctl_route_net_init,
3420 .exit = sysctl_route_net_exit,
3424 static __net_init int rt_genid_init(struct net *net)
3426 get_random_bytes(&net->ipv4.rt_genid,
3427 sizeof(net->ipv4.rt_genid));
3428 get_random_bytes(&net->ipv4.dev_addr_genid,
3429 sizeof(net->ipv4.dev_addr_genid));
3433 static __net_initdata struct pernet_operations rt_genid_ops = {
3434 .init = rt_genid_init,
3438 #ifdef CONFIG_IP_ROUTE_CLASSID
3439 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3440 #endif /* CONFIG_IP_ROUTE_CLASSID */
3442 static __initdata unsigned long rhash_entries;
3443 static int __init set_rhash_entries(char *str)
3447 rhash_entries = simple_strtoul(str, &str, 0);
3450 __setup("rhash_entries=", set_rhash_entries);
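/*
 * Example: booting with "rhash_entries=262144" on the kernel command
 * line overrides the memory-based sizing of the route cache hash
 * table done by alloc_large_system_hash() in ip_rt_init() below.
 */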
3452 int __init ip_rt_init(void)
3456 #ifdef CONFIG_IP_ROUTE_CLASSID
3457 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3459 panic("IP: failed to allocate ip_rt_acct\n");
3462 ipv4_dst_ops.kmem_cachep =
3463 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3464 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3466 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3468 if (dst_entries_init(&ipv4_dst_ops) < 0)
3469 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3471 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3472 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3474 rt_hash_table = (struct rt_hash_bucket *)
3475 alloc_large_system_hash("IP route cache",
3476 sizeof(struct rt_hash_bucket),
3478 (totalram_pages >= 128 * 1024) ?
3483 rhash_entries ? 0 : 512 * 1024);
3484 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3485 rt_hash_lock_init();
3487 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3488 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3493 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3494 expires_ljiffies = jiffies;
3495 schedule_delayed_work(&expires_work,
3496 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3498 if (ip_rt_proc_init())
3499 printk(KERN_ERR "Unable to create route proc files\n");
3502 xfrm4_init(ip_rt_max_size);
3504 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3506 #ifdef CONFIG_SYSCTL
3507 register_pernet_subsys(&sysctl_route_ops);
3509 register_pernet_subsys(&rt_genid_ops);
3513 #ifdef CONFIG_SYSCTL
3515 * We really need to sanitize the damn ipv4 init order; then all
3516 * this nonsense will go away.
3518 void __init ip_static_sysctl_init(void)
3520 register_sysctl_paths(ipv4_path, ipv4_skeleton);