1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/secure_seq.h>
112
113 #define RT_FL_TOS(oldflp4) \
114     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
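/*
 * A quick worked example of the defaults above (the *HZ values are in
 * jiffies): ip_rt_redirect_load is HZ/50, i.e. ~20 ms, and
 * ip_rt_redirect_silence is (HZ/50) << 10, i.e. ~20.5 s;
 * ip_rt_min_pmtu is 512 + 20 + 20 = 552 bytes (payload plus IP and TCP
 * headers).  These are only the compiled-in defaults; with CONFIG_SYSCTL
 * they are also exposed as run-time tunables.
 */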
134
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void              ipv4_link_failure(struct sk_buff *skb);
145 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
146 static int rt_garbage_collect(struct dst_ops *ops);
147
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149                             int how)
150 {
151 }
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         struct rtable *rt = (struct rtable *) dst;
156         struct inet_peer *peer;
157         u32 *p = NULL;
158
159         if (!rt->peer)
160                 rt_bind_peer(rt, rt->rt_dst, 1);
161
162         peer = rt->peer;
163         if (peer) {
164                 u32 *old_p = __DST_METRICS_PTR(old);
165                 unsigned long prev, new;
166
167                 p = peer->metrics;
168                 if (inet_metrics_new(peer))
169                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
170
171                 new = (unsigned long) p;
172                 prev = cmpxchg(&dst->_metrics, old, new);
173
174                 if (prev != old) {
175                         p = __DST_METRICS_PTR(prev);
176                         if (prev & DST_METRICS_READ_ONLY)
177                                 p = NULL;
178                 } else {
179                         if (rt->fi) {
180                                 fib_info_put(rt->fi);
181                                 rt->fi = NULL;
182                         }
183                 }
184         }
185         return p;
186 }
187
188 static struct dst_ops ipv4_dst_ops = {
189         .family =               AF_INET,
190         .protocol =             cpu_to_be16(ETH_P_IP),
191         .gc =                   rt_garbage_collect,
192         .check =                ipv4_dst_check,
193         .default_advmss =       ipv4_default_advmss,
194         .default_mtu =          ipv4_default_mtu,
195         .cow_metrics =          ipv4_cow_metrics,
196         .destroy =              ipv4_dst_destroy,
197         .ifdown =               ipv4_dst_ifdown,
198         .negative_advice =      ipv4_negative_advice,
199         .link_failure =         ipv4_link_failure,
200         .update_pmtu =          ip_rt_update_pmtu,
201         .local_out =            __ip_local_out,
202 };
203
204 #define ECN_OR_COST(class)      TC_PRIO_##class
205
206 const __u8 ip_tos2prio[16] = {
207         TC_PRIO_BESTEFFORT,
208         ECN_OR_COST(BESTEFFORT),
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(BESTEFFORT),
211         TC_PRIO_BULK,
212         ECN_OR_COST(BULK),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_INTERACTIVE,
216         ECN_OR_COST(INTERACTIVE),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE_BULK,
220         ECN_OR_COST(INTERACTIVE_BULK),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK)
223 };
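/*
 * For illustration: the table is indexed by the four TOS bits shifted
 * right by one (the rt_tos2priority() helper in include/net/route.h), so
 * e.g. IPTOS_LOWDELAY (0x10) gives index 8 and maps to TC_PRIO_INTERACTIVE,
 * while the default TOS of 0 maps to TC_PRIO_BESTEFFORT.
 */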
224
225
226 /*
227  * Route cache.
228  */
229
230 /* The locking scheme is rather straightforward:
231  *
232  * 1) Read-Copy Update protects the buckets of the central route hash.
233  * 2) Only writers remove entries, and they hold the lock
234  *    as they look at rtable reference counts.
235  * 3) Only readers acquire references to rtable entries,
236  *    they do so with atomic increments and with the
237  *    lock held.
238  */
239
240 struct rt_hash_bucket {
241         struct rtable __rcu     *chain;
242 };
243
244 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
245         defined(CONFIG_PROVE_LOCKING)
246 /*
247  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
248  * The size of this table is a power of two and depends on the number of CPUs.
249  * (with lockdep we have a quite big spinlock_t, so keep the size down there)
250  */
251 #ifdef CONFIG_LOCKDEP
252 # define RT_HASH_LOCK_SZ        256
253 #else
254 # if NR_CPUS >= 32
255 #  define RT_HASH_LOCK_SZ       4096
256 # elif NR_CPUS >= 16
257 #  define RT_HASH_LOCK_SZ       2048
258 # elif NR_CPUS >= 8
259 #  define RT_HASH_LOCK_SZ       1024
260 # elif NR_CPUS >= 4
261 #  define RT_HASH_LOCK_SZ       512
262 # else
263 #  define RT_HASH_LOCK_SZ       256
264 # endif
265 #endif
266
267 static spinlock_t       *rt_hash_locks;
268 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
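/*
 * Example: on a non-lockdep build with NR_CPUS >= 32, RT_HASH_LOCK_SZ is
 * 4096, so hash slot 5000 shares its lock with slot 904
 * (5000 & 4095 == 904) via rt_hash_lock_addr().
 */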
269
270 static __init void rt_hash_lock_init(void)
271 {
272         int i;
273
274         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
275                         GFP_KERNEL);
276         if (!rt_hash_locks)
277                 panic("IP: failed to allocate rt_hash_locks\n");
278
279         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
280                 spin_lock_init(&rt_hash_locks[i]);
281 }
282 #else
283 # define rt_hash_lock_addr(slot) NULL
284
285 static inline void rt_hash_lock_init(void)
286 {
287 }
288 #endif
289
290 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
291 static unsigned                 rt_hash_mask __read_mostly;
292 static unsigned int             rt_hash_log  __read_mostly;
293
294 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
295 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
296
297 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
298                                    int genid)
299 {
300         return jhash_3words((__force u32)daddr, (__force u32)saddr,
301                             idx, genid)
302                 & rt_hash_mask;
303 }
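/*
 * Note that the generation id is one of the hash inputs: once
 * rt_cache_invalidate() bumps rt_genid, lookups with the new id generally
 * land in different buckets, and the stale entries are reaped lazily via
 * rt_is_expired().
 */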
304
305 static inline int rt_genid(struct net *net)
306 {
307         return atomic_read(&net->ipv4.rt_genid);
308 }
309
310 #ifdef CONFIG_PROC_FS
311 struct rt_cache_iter_state {
312         struct seq_net_private p;
313         int bucket;
314         int genid;
315 };
316
317 static struct rtable *rt_cache_get_first(struct seq_file *seq)
318 {
319         struct rt_cache_iter_state *st = seq->private;
320         struct rtable *r = NULL;
321
322         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
323                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
324                         continue;
325                 rcu_read_lock_bh();
326                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
327                 while (r) {
328                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
329                             r->rt_genid == st->genid)
330                                 return r;
331                         r = rcu_dereference_bh(r->dst.rt_next);
332                 }
333                 rcu_read_unlock_bh();
334         }
335         return r;
336 }
337
338 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
339                                           struct rtable *r)
340 {
341         struct rt_cache_iter_state *st = seq->private;
342
343         r = rcu_dereference_bh(r->dst.rt_next);
344         while (!r) {
345                 rcu_read_unlock_bh();
346                 do {
347                         if (--st->bucket < 0)
348                                 return NULL;
349                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
350                 rcu_read_lock_bh();
351                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
352         }
353         return r;
354 }
355
356 static struct rtable *rt_cache_get_next(struct seq_file *seq,
357                                         struct rtable *r)
358 {
359         struct rt_cache_iter_state *st = seq->private;
360         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
361                 if (dev_net(r->dst.dev) != seq_file_net(seq))
362                         continue;
363                 if (r->rt_genid == st->genid)
364                         break;
365         }
366         return r;
367 }
368
369 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
370 {
371         struct rtable *r = rt_cache_get_first(seq);
372
373         if (r)
374                 while (pos && (r = rt_cache_get_next(seq, r)))
375                         --pos;
376         return pos ? NULL : r;
377 }
378
379 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
380 {
381         struct rt_cache_iter_state *st = seq->private;
382         if (*pos)
383                 return rt_cache_get_idx(seq, *pos - 1);
384         st->genid = rt_genid(seq_file_net(seq));
385         return SEQ_START_TOKEN;
386 }
387
388 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
389 {
390         struct rtable *r;
391
392         if (v == SEQ_START_TOKEN)
393                 r = rt_cache_get_first(seq);
394         else
395                 r = rt_cache_get_next(seq, v);
396         ++*pos;
397         return r;
398 }
399
400 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
401 {
402         if (v && v != SEQ_START_TOKEN)
403                 rcu_read_unlock_bh();
404 }
405
406 static int rt_cache_seq_show(struct seq_file *seq, void *v)
407 {
408         if (v == SEQ_START_TOKEN)
409                 seq_printf(seq, "%-127s\n",
410                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
411                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
412                            "HHUptod\tSpecDst");
413         else {
414                 struct rtable *r = v;
415                 int len;
416
417                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
418                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
419                         r->dst.dev ? r->dst.dev->name : "*",
420                         (__force u32)r->rt_dst,
421                         (__force u32)r->rt_gateway,
422                         r->rt_flags, atomic_read(&r->dst.__refcnt),
423                         r->dst.__use, 0, (__force u32)r->rt_src,
424                         dst_metric_advmss(&r->dst) + 40,
425                         dst_metric(&r->dst, RTAX_WINDOW),
426                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
427                               dst_metric(&r->dst, RTAX_RTTVAR)),
428                         r->rt_key_tos,
429                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
430                         r->dst.hh ? (r->dst.hh->hh_output ==
431                                        dev_queue_xmit) : 0,
432                         r->rt_spec_dst, &len);
433
434                 seq_printf(seq, "%*s\n", 127 - len, "");
435         }
436         return 0;
437 }
438
439 static const struct seq_operations rt_cache_seq_ops = {
440         .start  = rt_cache_seq_start,
441         .next   = rt_cache_seq_next,
442         .stop   = rt_cache_seq_stop,
443         .show   = rt_cache_seq_show,
444 };
445
446 static int rt_cache_seq_open(struct inode *inode, struct file *file)
447 {
448         return seq_open_net(inode, file, &rt_cache_seq_ops,
449                         sizeof(struct rt_cache_iter_state));
450 }
451
452 static const struct file_operations rt_cache_seq_fops = {
453         .owner   = THIS_MODULE,
454         .open    = rt_cache_seq_open,
455         .read    = seq_read,
456         .llseek  = seq_lseek,
457         .release = seq_release_net,
458 };
459
460
461 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
462 {
463         int cpu;
464
465         if (*pos == 0)
466                 return SEQ_START_TOKEN;
467
468         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
469                 if (!cpu_possible(cpu))
470                         continue;
471                 *pos = cpu+1;
472                 return &per_cpu(rt_cache_stat, cpu);
473         }
474         return NULL;
475 }
476
477 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
478 {
479         int cpu;
480
481         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
482                 if (!cpu_possible(cpu))
483                         continue;
484                 *pos = cpu+1;
485                 return &per_cpu(rt_cache_stat, cpu);
486         }
487         return NULL;
488
489 }
490
491 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
492 {
493
494 }
495
496 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
497 {
498         struct rt_cache_stat *st = v;
499
500         if (v == SEQ_START_TOKEN) {
501                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
502                 return 0;
503         }
504
505         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
506                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
507                    dst_entries_get_slow(&ipv4_dst_ops),
508                    st->in_hit,
509                    st->in_slow_tot,
510                    st->in_slow_mc,
511                    st->in_no_route,
512                    st->in_brd,
513                    st->in_martian_dst,
514                    st->in_martian_src,
515
516                    st->out_hit,
517                    st->out_slow_tot,
518                    st->out_slow_mc,
519
520                    st->gc_total,
521                    st->gc_ignored,
522                    st->gc_goal_miss,
523                    st->gc_dst_overflow,
524                    st->in_hlist_search,
525                    st->out_hlist_search
526                 );
527         return 0;
528 }
529
530 static const struct seq_operations rt_cpu_seq_ops = {
531         .start  = rt_cpu_seq_start,
532         .next   = rt_cpu_seq_next,
533         .stop   = rt_cpu_seq_stop,
534         .show   = rt_cpu_seq_show,
535 };
536
537
538 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
539 {
540         return seq_open(file, &rt_cpu_seq_ops);
541 }
542
543 static const struct file_operations rt_cpu_seq_fops = {
544         .owner   = THIS_MODULE,
545         .open    = rt_cpu_seq_open,
546         .read    = seq_read,
547         .llseek  = seq_lseek,
548         .release = seq_release,
549 };
550
551 #ifdef CONFIG_IP_ROUTE_CLASSID
552 static int rt_acct_proc_show(struct seq_file *m, void *v)
553 {
554         struct ip_rt_acct *dst, *src;
555         unsigned int i, j;
556
557         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
558         if (!dst)
559                 return -ENOMEM;
560
561         for_each_possible_cpu(i) {
562                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
563                 for (j = 0; j < 256; j++) {
564                         dst[j].o_bytes   += src[j].o_bytes;
565                         dst[j].o_packets += src[j].o_packets;
566                         dst[j].i_bytes   += src[j].i_bytes;
567                         dst[j].i_packets += src[j].i_packets;
568                 }
569         }
570
571         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
572         kfree(dst);
573         return 0;
574 }
575
576 static int rt_acct_proc_open(struct inode *inode, struct file *file)
577 {
578         return single_open(file, rt_acct_proc_show, NULL);
579 }
580
581 static const struct file_operations rt_acct_proc_fops = {
582         .owner          = THIS_MODULE,
583         .open           = rt_acct_proc_open,
584         .read           = seq_read,
585         .llseek         = seq_lseek,
586         .release        = single_release,
587 };
588 #endif
589
590 static int __net_init ip_rt_do_proc_init(struct net *net)
591 {
592         struct proc_dir_entry *pde;
593
594         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
595                         &rt_cache_seq_fops);
596         if (!pde)
597                 goto err1;
598
599         pde = proc_create("rt_cache", S_IRUGO,
600                           net->proc_net_stat, &rt_cpu_seq_fops);
601         if (!pde)
602                 goto err2;
603
604 #ifdef CONFIG_IP_ROUTE_CLASSID
605         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
606         if (!pde)
607                 goto err3;
608 #endif
609         return 0;
610
611 #ifdef CONFIG_IP_ROUTE_CLASSID
612 err3:
613         remove_proc_entry("rt_cache", net->proc_net_stat);
614 #endif
615 err2:
616         remove_proc_entry("rt_cache", net->proc_net);
617 err1:
618         return -ENOMEM;
619 }
620
621 static void __net_exit ip_rt_do_proc_exit(struct net *net)
622 {
623         remove_proc_entry("rt_cache", net->proc_net_stat);
624         remove_proc_entry("rt_cache", net->proc_net);
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626         remove_proc_entry("rt_acct", net->proc_net);
627 #endif
628 }
629
630 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
631         .init = ip_rt_do_proc_init,
632         .exit = ip_rt_do_proc_exit,
633 };
634
635 static int __init ip_rt_proc_init(void)
636 {
637         return register_pernet_subsys(&ip_rt_proc_ops);
638 }
639
640 #else
641 static inline int ip_rt_proc_init(void)
642 {
643         return 0;
644 }
645 #endif /* CONFIG_PROC_FS */
646
647 static inline void rt_free(struct rtable *rt)
648 {
649         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
650 }
651
652 static inline void rt_drop(struct rtable *rt)
653 {
654         ip_rt_put(rt);
655         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
656 }
657
658 static inline int rt_fast_clean(struct rtable *rth)
659 {
660         /* Kill broadcast/multicast entries very aggressively if they
661            collide in the hash table with more useful entries */
662         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
663                 rt_is_input_route(rth) && rth->dst.rt_next;
664 }
665
666 static inline int rt_valuable(struct rtable *rth)
667 {
668         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
669                 (rth->peer && rth->peer->pmtu_expires);
670 }
671
672 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
673 {
674         unsigned long age;
675         int ret = 0;
676
677         if (atomic_read(&rth->dst.__refcnt))
678                 goto out;
679
680         age = jiffies - rth->dst.lastuse;
681         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
682             (age <= tmo2 && rt_valuable(rth)))
683                 goto out;
684         ret = 1;
685 out:    return ret;
686 }
687
688 /* Bits of score are:
689  * 31: very valuable
690  * 30: not quite useless
691  * 29..0: usage counter
692  */
693 static inline u32 rt_score(struct rtable *rt)
694 {
695         u32 score = jiffies - rt->dst.lastuse;
696
697         score = ~score & ~(3<<30);
698
699         if (rt_valuable(rt))
700                 score |= (1<<31);
701
702         if (rt_is_output_route(rt) ||
703             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
704                 score |= (1<<30);
705
706         return score;
707 }
708
709 static inline bool rt_caching(const struct net *net)
710 {
711         return net->ipv4.current_rt_cache_rebuild_count <=
712                 net->ipv4.sysctl_rt_cache_rebuild_count;
713 }
714
715 static inline bool compare_hash_inputs(const struct rtable *rt1,
716                                        const struct rtable *rt2)
717 {
718         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
719                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
720                 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
721 }
722
723 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
724 {
725         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
726                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
727                 (rt1->rt_mark ^ rt2->rt_mark) |
728                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
729                 (rt1->rt_oif ^ rt2->rt_oif) |
730                 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
731 }
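/*
 * Both helpers above rely on the same branch-free idiom: XOR each pair of
 * fields (a pair XORs to 0 only when the two values are equal) and OR the
 * results together, so the whole expression is 0 iff every compared field
 * matches.
 */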
732
733 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
734 {
735         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
736 }
737
738 static inline int rt_is_expired(struct rtable *rth)
739 {
740         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
741 }
742
743 /*
744  * Perform a full scan of the hash table and free all entries.
745  * Can be called by a softirq or a process.
746  * In the latter case, we reschedule if necessary.
747  */
748 static void rt_do_flush(struct net *net, int process_context)
749 {
750         unsigned int i;
751         struct rtable *rth, *next;
752
753         for (i = 0; i <= rt_hash_mask; i++) {
754                 struct rtable __rcu **pprev;
755                 struct rtable *list;
756
757                 if (process_context && need_resched())
758                         cond_resched();
759                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
760                 if (!rth)
761                         continue;
762
763                 spin_lock_bh(rt_hash_lock_addr(i));
764
765                 list = NULL;
766                 pprev = &rt_hash_table[i].chain;
767                 rth = rcu_dereference_protected(*pprev,
768                         lockdep_is_held(rt_hash_lock_addr(i)));
769
770                 while (rth) {
771                         next = rcu_dereference_protected(rth->dst.rt_next,
772                                 lockdep_is_held(rt_hash_lock_addr(i)));
773
774                         if (!net ||
775                             net_eq(dev_net(rth->dst.dev), net)) {
776                                 rcu_assign_pointer(*pprev, next);
777                                 rcu_assign_pointer(rth->dst.rt_next, list);
778                                 list = rth;
779                         } else {
780                                 pprev = &rth->dst.rt_next;
781                         }
782                         rth = next;
783                 }
784
785                 spin_unlock_bh(rt_hash_lock_addr(i));
786
787                 for (; list; list = next) {
788                         next = rcu_dereference_protected(list->dst.rt_next, 1);
789                         rt_free(list);
790                 }
791         }
792 }
793
794 /*
795  * While freeing expired entries, we compute average chain length
796  * and standard deviation, using fixed-point arithmetic.
797  * This gives an estimate of rt_chain_length_max:
798  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
799  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
800  */
801
802 #define FRACT_BITS 3
803 #define ONE (1UL << FRACT_BITS)
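/*
 * Worked example of the fixed-point encoding: with FRACT_BITS == 3,
 * ONE == 8, so a stored value of 20 represents 20/8 == 2.5 entries;
 * has_noalias() below returns ONE per distinct entry, and
 * slow_chain_length() shifts the sum back down by FRACT_BITS to recover a
 * plain integer count.
 */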
804
805 /*
806  * Given a hash chain and an item in this hash chain,
807  * find if a previous entry has the same hash_inputs
808  * (but differs on tos, mark or oif)
809  * Returns 0 if an alias is found.
810  * Returns ONE if rth has no alias before itself.
811  */
812 static int has_noalias(const struct rtable *head, const struct rtable *rth)
813 {
814         const struct rtable *aux = head;
815
816         while (aux != rth) {
817                 if (compare_hash_inputs(aux, rth))
818                         return 0;
819                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
820         }
821         return ONE;
822 }
823
824 /*
825  * Perturbation of rt_genid by a small quantity [1..256]
826  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
827  * many times (2^24) without reusing a recent rt_genid.
828  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
829  */
830 static void rt_cache_invalidate(struct net *net)
831 {
832         unsigned char shuffle;
833
834         get_random_bytes(&shuffle, sizeof(shuffle));
835         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
836 }
837
838 /*
839  * delay < 0  : invalidate cache (fast : entries will be deleted later)
840  * delay >= 0 : invalidate & flush cache (can be long)
841  */
842 void rt_cache_flush(struct net *net, int delay)
843 {
844         rt_cache_invalidate(net);
845         if (delay >= 0)
846                 rt_do_flush(net, !in_softirq());
847 }
848
849 /* Flush previous cache invalidated entries from the cache */
850 void rt_cache_flush_batch(struct net *net)
851 {
852         rt_do_flush(net, !in_softirq());
853 }
854
855 static void rt_emergency_hash_rebuild(struct net *net)
856 {
857         if (net_ratelimit())
858                 printk(KERN_WARNING "Route hash chain too long!\n");
859         rt_cache_invalidate(net);
860 }
861
862 /*
863    Short description of GC goals.
864
865    We want to build an algorithm which keeps the routing cache
866    at some equilibrium point, where the number of aged-off entries
867    is approximately equal to the number of newly generated ones.
868
869    The current expiration strength is the variable "expire".
870    We try to adjust it dynamically, so that when networking
871    is idle "expire" is large enough to keep plenty of warm entries,
872    and when load increases it shrinks to limit the cache size.
873  */
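/*
 * In concrete terms: the function below first aims for
 * goal = entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. it tolerates
 * an average of ip_rt_gc_elasticity (default 8) cached routes per hash
 * bucket before it starts expiring entries in earnest.
 */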
874
875 static int rt_garbage_collect(struct dst_ops *ops)
876 {
877         static unsigned long expire = RT_GC_TIMEOUT;
878         static unsigned long last_gc;
879         static int rover;
880         static int equilibrium;
881         struct rtable *rth;
882         struct rtable __rcu **rthp;
883         unsigned long now = jiffies;
884         int goal;
885         int entries = dst_entries_get_fast(&ipv4_dst_ops);
886
887         /*
888          * Garbage collection is pretty expensive,
889          * do not run it too frequently.
890          */
891
892         RT_CACHE_STAT_INC(gc_total);
893
894         if (now - last_gc < ip_rt_gc_min_interval &&
895             entries < ip_rt_max_size) {
896                 RT_CACHE_STAT_INC(gc_ignored);
897                 goto out;
898         }
899
900         entries = dst_entries_get_slow(&ipv4_dst_ops);
901         /* Calculate the number of entries we want to expire now. */
902         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
903         if (goal <= 0) {
904                 if (equilibrium < ipv4_dst_ops.gc_thresh)
905                         equilibrium = ipv4_dst_ops.gc_thresh;
906                 goal = entries - equilibrium;
907                 if (goal > 0) {
908                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
909                         goal = entries - equilibrium;
910                 }
911         } else {
912                 /* We are in a dangerous area. Try to reduce the cache really
913                  * aggressively.
914                  */
915                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
916                 equilibrium = entries - goal;
917         }
918
919         if (now - last_gc >= ip_rt_gc_min_interval)
920                 last_gc = now;
921
922         if (goal <= 0) {
923                 equilibrium += goal;
924                 goto work_done;
925         }
926
927         do {
928                 int i, k;
929
930                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
931                         unsigned long tmo = expire;
932
933                         k = (k + 1) & rt_hash_mask;
934                         rthp = &rt_hash_table[k].chain;
935                         spin_lock_bh(rt_hash_lock_addr(k));
936                         while ((rth = rcu_dereference_protected(*rthp,
937                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
938                                 if (!rt_is_expired(rth) &&
939                                         !rt_may_expire(rth, tmo, expire)) {
940                                         tmo >>= 1;
941                                         rthp = &rth->dst.rt_next;
942                                         continue;
943                                 }
944                                 *rthp = rth->dst.rt_next;
945                                 rt_free(rth);
946                                 goal--;
947                         }
948                         spin_unlock_bh(rt_hash_lock_addr(k));
949                         if (goal <= 0)
950                                 break;
951                 }
952                 rover = k;
953
954                 if (goal <= 0)
955                         goto work_done;
956
957                 /* The goal was not achieved. We stop the process if:
958
959                    - expire has been reduced to zero (otherwise expire is halved);
960                    - the table is not full;
961                    - we are called from interrupt (softirq) context;
962                    - the jiffies check is just a fallback/debug loop breaker;
963                      we will not spin here for a long time in any case.
964                  */
965
966                 RT_CACHE_STAT_INC(gc_goal_miss);
967
968                 if (expire == 0)
969                         break;
970
971                 expire >>= 1;
972
973                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
974                         goto out;
975         } while (!in_softirq() && time_before_eq(jiffies, now));
976
977         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
978                 goto out;
979         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
980                 goto out;
981         if (net_ratelimit())
982                 printk(KERN_WARNING "dst cache overflow\n");
983         RT_CACHE_STAT_INC(gc_dst_overflow);
984         return 1;
985
986 work_done:
987         expire += ip_rt_gc_min_interval;
988         if (expire > ip_rt_gc_timeout ||
989             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
990             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
991                 expire = ip_rt_gc_timeout;
992 out:    return 0;
993 }
994
995 /*
996  * Returns number of entries in a hash chain that have different hash_inputs
997  */
998 static int slow_chain_length(const struct rtable *head)
999 {
1000         int length = 0;
1001         const struct rtable *rth = head;
1002
1003         while (rth) {
1004                 length += has_noalias(head, rth);
1005                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1006         }
1007         return length >> FRACT_BITS;
1008 }
1009
1010 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1011                                      struct sk_buff *skb, int ifindex)
1012 {
1013         struct rtable   *rth, *cand;
1014         struct rtable __rcu **rthp, **candp;
1015         unsigned long   now;
1016         u32             min_score;
1017         int             chain_length;
1018         int attempts = !in_softirq();
1019
1020 restart:
1021         chain_length = 0;
1022         min_score = ~(u32)0;
1023         cand = NULL;
1024         candp = NULL;
1025         now = jiffies;
1026
1027         if (!rt_caching(dev_net(rt->dst.dev))) {
1028                 /*
1029                  * If we're not caching, just tell the caller we
1030                  * were successful and don't touch the route.  The
1031                  * caller holds the sole reference to the cache entry, and
1032                  * it will be released when the caller is done with it.
1033                  * If we drop it here, the callers have no way to resolve routes
1034                  * when we're not caching.  Instead, just point *rp at rt, so
1035                  * the caller gets a single use out of the route
1036                  * Note that we do rt_free on this new route entry, so that
1037                  * once its refcount hits zero, we are still able to reap it
1038                  * (Thanks Alexey)
1039                  * Note: To avoid expensive rcu stuff for this uncached dst,
1040                  * we set DST_NOCACHE so that dst_release() can free dst without
1041                  * waiting for a grace period.
1042                  */
1043
1044                 rt->dst.flags |= DST_NOCACHE;
1045                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1046                         int err = arp_bind_neighbour(&rt->dst);
1047                         if (err) {
1048                                 if (net_ratelimit())
1049                                         printk(KERN_WARNING
1050                                             "Neighbour table failure & not caching routes.\n");
1051                                 ip_rt_put(rt);
1052                                 return ERR_PTR(err);
1053                         }
1054                 }
1055
1056                 goto skip_hashing;
1057         }
1058
1059         rthp = &rt_hash_table[hash].chain;
1060
1061         spin_lock_bh(rt_hash_lock_addr(hash));
1062         while ((rth = rcu_dereference_protected(*rthp,
1063                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1064                 if (rt_is_expired(rth)) {
1065                         *rthp = rth->dst.rt_next;
1066                         rt_free(rth);
1067                         continue;
1068                 }
1069                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1070                         /* Put it first */
1071                         *rthp = rth->dst.rt_next;
1072                         /*
1073                          * Since lookup is lockfree, the deletion
1074                          * must be visible to another weakly ordered CPU before
1075                          * the insertion at the start of the hash chain.
1076                          */
1077                         rcu_assign_pointer(rth->dst.rt_next,
1078                                            rt_hash_table[hash].chain);
1079                         /*
1080                          * Since lookup is lockfree, the update writes
1081                          * must be ordered for consistency on SMP.
1082                          */
1083                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1084
1085                         dst_use(&rth->dst, now);
1086                         spin_unlock_bh(rt_hash_lock_addr(hash));
1087
1088                         rt_drop(rt);
1089                         if (skb)
1090                                 skb_dst_set(skb, &rth->dst);
1091                         return rth;
1092                 }
1093
1094                 if (!atomic_read(&rth->dst.__refcnt)) {
1095                         u32 score = rt_score(rth);
1096
1097                         if (score <= min_score) {
1098                                 cand = rth;
1099                                 candp = rthp;
1100                                 min_score = score;
1101                         }
1102                 }
1103
1104                 chain_length++;
1105
1106                 rthp = &rth->dst.rt_next;
1107         }
1108
1109         if (cand) {
1110                 /* ip_rt_gc_elasticity used to be the average chain
1111                  * length; when exceeded, GC becomes really aggressive.
1112                  *
1113                  * The second limit is less certain. At the moment it allows
1114                  * only 2 entries per bucket. We will see.
1115                  */
1116                 if (chain_length > ip_rt_gc_elasticity) {
1117                         *candp = cand->dst.rt_next;
1118                         rt_free(cand);
1119                 }
1120         } else {
1121                 if (chain_length > rt_chain_length_max &&
1122                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1123                         struct net *net = dev_net(rt->dst.dev);
1124                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1125                         if (!rt_caching(net)) {
1126                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1127                                         rt->dst.dev->name, num);
1128                         }
1129                         rt_emergency_hash_rebuild(net);
1130                         spin_unlock_bh(rt_hash_lock_addr(hash));
1131
1132                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1133                                         ifindex, rt_genid(net));
1134                         goto restart;
1135                 }
1136         }
1137
1138         /* Try to bind the route to ARP only if it is an output
1139            route or a unicast forwarding path.
1140          */
1141         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1142                 int err = arp_bind_neighbour(&rt->dst);
1143                 if (err) {
1144                         spin_unlock_bh(rt_hash_lock_addr(hash));
1145
1146                         if (err != -ENOBUFS) {
1147                                 rt_drop(rt);
1148                                 return ERR_PTR(err);
1149                         }
1150
1151                         /* Neighbour tables are full and nothing
1152                            can be released. Try to shrink the route cache,
1153                            as it most likely holds some neighbour records.
1154                          */
1155                         if (attempts-- > 0) {
1156                                 int saved_elasticity = ip_rt_gc_elasticity;
1157                                 int saved_int = ip_rt_gc_min_interval;
1158                                 ip_rt_gc_elasticity     = 1;
1159                                 ip_rt_gc_min_interval   = 0;
1160                                 rt_garbage_collect(&ipv4_dst_ops);
1161                                 ip_rt_gc_min_interval   = saved_int;
1162                                 ip_rt_gc_elasticity     = saved_elasticity;
1163                                 goto restart;
1164                         }
1165
1166                         if (net_ratelimit())
1167                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1168                         rt_drop(rt);
1169                         return ERR_PTR(-ENOBUFS);
1170                 }
1171         }
1172
1173         rt->dst.rt_next = rt_hash_table[hash].chain;
1174
1175         /*
1176          * Since lookup is lockfree, we must make sure
1177          * previous writes to rt are committed to memory
1178          * before making rt visible to other CPUS.
1179          */
1180         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1181
1182         spin_unlock_bh(rt_hash_lock_addr(hash));
1183
1184 skip_hashing:
1185         if (skb)
1186                 skb_dst_set(skb, &rt->dst);
1187         return rt;
1188 }
1189
1190 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1191
1192 static u32 rt_peer_genid(void)
1193 {
1194         return atomic_read(&__rt_peer_genid);
1195 }
1196
1197 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1198 {
1199         struct inet_peer *peer;
1200
1201         peer = inet_getpeer_v4(daddr, create);
1202
1203         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1204                 inet_putpeer(peer);
1205         else
1206                 rt->rt_peer_genid = rt_peer_genid();
1207 }
1208
1209 /*
1210  * Peer allocation may fail only in serious out-of-memory conditions.  However
1211  * we can still generate some output.
1212  * Random ID selection looks a bit dangerous because we have no chance of
1213  * selecting an ID that is unique within a reasonable period of time.
1214  * But a broken packet identifier may be better than no packet at all.
1215  */
1216 static void ip_select_fb_ident(struct iphdr *iph)
1217 {
1218         static DEFINE_SPINLOCK(ip_fb_id_lock);
1219         static u32 ip_fallback_id;
1220         u32 salt;
1221
1222         spin_lock_bh(&ip_fb_id_lock);
1223         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1224         iph->id = htons(salt & 0xFFFF);
1225         ip_fallback_id = salt;
1226         spin_unlock_bh(&ip_fb_id_lock);
1227 }
1228
1229 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1230 {
1231         struct rtable *rt = (struct rtable *) dst;
1232
1233         if (rt) {
1234                 if (rt->peer == NULL)
1235                         rt_bind_peer(rt, rt->rt_dst, 1);
1236
1237                 /* If peer is attached to destination, it is never detached,
1238                    so we need not grab a lock to dereference it.
1239                  */
1240                 if (rt->peer) {
1241                         iph->id = htons(inet_getid(rt->peer, more));
1242                         return;
1243                 }
1244         } else
1245                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1246                        __builtin_return_address(0));
1247
1248         ip_select_fb_ident(iph);
1249 }
1250 EXPORT_SYMBOL(__ip_select_ident);
1251
1252 static void rt_del(unsigned hash, struct rtable *rt)
1253 {
1254         struct rtable __rcu **rthp;
1255         struct rtable *aux;
1256
1257         rthp = &rt_hash_table[hash].chain;
1258         spin_lock_bh(rt_hash_lock_addr(hash));
1259         ip_rt_put(rt);
1260         while ((aux = rcu_dereference_protected(*rthp,
1261                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1262                 if (aux == rt || rt_is_expired(aux)) {
1263                         *rthp = aux->dst.rt_next;
1264                         rt_free(aux);
1265                         continue;
1266                 }
1267                 rthp = &aux->dst.rt_next;
1268         }
1269         spin_unlock_bh(rt_hash_lock_addr(hash));
1270 }
1271
1272 /* called in rcu_read_lock() section */
1273 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1274                     __be32 saddr, struct net_device *dev)
1275 {
1276         struct in_device *in_dev = __in_dev_get_rcu(dev);
1277         struct inet_peer *peer;
1278         struct net *net;
1279
1280         if (!in_dev)
1281                 return;
1282
1283         net = dev_net(dev);
1284         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1285             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1286             ipv4_is_zeronet(new_gw))
1287                 goto reject_redirect;
1288
1289         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1290                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1291                         goto reject_redirect;
1292                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1293                         goto reject_redirect;
1294         } else {
1295                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1296                         goto reject_redirect;
1297         }
1298
1299         peer = inet_getpeer_v4(daddr, 1);
1300         if (peer) {
1301                 peer->redirect_learned.a4 = new_gw;
1302
1303                 inet_putpeer(peer);
1304
1305                 atomic_inc(&__rt_peer_genid);
1306         }
1307         return;
1308
1309 reject_redirect:
1310 #ifdef CONFIG_IP_ROUTE_VERBOSE
1311         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1312                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1313                         "  Advised path = %pI4 -> %pI4\n",
1314                        &old_gw, dev->name, &new_gw,
1315                        &saddr, &daddr);
1316 #endif
1317         ;
1318 }
1319
1320 static bool peer_pmtu_expired(struct inet_peer *peer)
1321 {
1322         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1323
1324         return orig &&
1325                time_after_eq(jiffies, orig) &&
1326                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1327 }
1328
1329 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1330 {
1331         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1332
1333         return orig &&
1334                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1335 }
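/*
 * The cmpxchg() in both helpers guarantees that, once a learned PMTU
 * expires, exactly one caller sees the old non-zero pmtu_expires value and
 * zeroes it; that caller then restores peer->pmtu_orig into the route
 * metrics (see ipv4_negative_advice(), and the same pattern open-coded in
 * check_peer_pmtu()).
 */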
1336
1337 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1338 {
1339         struct rtable *rt = (struct rtable *)dst;
1340         struct dst_entry *ret = dst;
1341
1342         if (rt) {
1343                 if (dst->obsolete > 0) {
1344                         ip_rt_put(rt);
1345                         ret = NULL;
1346                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1347                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1348                                                 rt->rt_oif,
1349                                                 rt_genid(dev_net(dst->dev)));
1350                         rt_del(hash, rt);
1351                         ret = NULL;
1352                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1353                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1354                 }
1355         }
1356         return ret;
1357 }
1358
1359 /*
1360  * Algorithm:
1361  *      1. The first ip_rt_redirect_number redirects are sent
1362  *         with exponential backoff, then we stop sending them at all,
1363  *         assuming that the host ignores our redirects.
1364  *      2. If we did not see packets requiring redirects
1365  *         during ip_rt_redirect_silence, we assume that the host
1366  *         forgot the redirected route and start sending redirects again.
1367  *
1368  * This algorithm is much cheaper and more intelligent than dumb load limiting
1369  * in icmp.c.
1370  *
1371  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1372  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1373  */
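/*
 * With the default tunables this works out roughly as follows: after the
 * k-th redirect, the next one is sent only once jiffies passes
 * rate_last + (ip_rt_redirect_load << k), i.e. gaps doubling from ~40 ms up
 * to ~5 s; once ip_rt_redirect_number (9) redirects have gone out we stay
 * silent until ~20 s (ip_rt_redirect_silence) elapse with no
 * redirect-worthy traffic, which resets rate_tokens.
 */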
1374
1375 void ip_rt_send_redirect(struct sk_buff *skb)
1376 {
1377         struct rtable *rt = skb_rtable(skb);
1378         struct in_device *in_dev;
1379         struct inet_peer *peer;
1380         int log_martians;
1381
1382         rcu_read_lock();
1383         in_dev = __in_dev_get_rcu(rt->dst.dev);
1384         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1385                 rcu_read_unlock();
1386                 return;
1387         }
1388         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1389         rcu_read_unlock();
1390
1391         if (!rt->peer)
1392                 rt_bind_peer(rt, rt->rt_dst, 1);
1393         peer = rt->peer;
1394         if (!peer) {
1395                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1396                 return;
1397         }
1398
1399         /* No redirected packets during ip_rt_redirect_silence;
1400          * reset the algorithm.
1401          */
1402         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1403                 peer->rate_tokens = 0;
1404
1405         /* Too many ignored redirects; do not send anything.
1406          * Just record the time of this packet in peer->rate_last.
1407          */
1408         if (peer->rate_tokens >= ip_rt_redirect_number) {
1409                 peer->rate_last = jiffies;
1410                 return;
1411         }
1412
1413         /* Check for load limit; set rate_last to the latest sent
1414          * redirect.
1415          */
1416         if (peer->rate_tokens == 0 ||
1417             time_after(jiffies,
1418                        (peer->rate_last +
1419                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1420                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1421                 peer->rate_last = jiffies;
1422                 ++peer->rate_tokens;
1423 #ifdef CONFIG_IP_ROUTE_VERBOSE
1424                 if (log_martians &&
1425                     peer->rate_tokens == ip_rt_redirect_number &&
1426                     net_ratelimit())
1427                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1428                                &ip_hdr(skb)->saddr, rt->rt_iif,
1429                                 &rt->rt_dst, &rt->rt_gateway);
1430 #endif
1431         }
1432 }
1433
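/*
 * ip_error() rate-limits the ICMP errors it generates with a simple token
 * bucket kept in the inet_peer: tokens accrue one per jiffy since rate_last,
 * are capped at ip_rt_error_burst (5 * HZ), and each ICMP sent costs
 * ip_rt_error_cost (HZ) -- so at most ~5 errors back to back and about one
 * per second in steady state per destination.
 */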
1434 static int ip_error(struct sk_buff *skb)
1435 {
1436         struct rtable *rt = skb_rtable(skb);
1437         struct inet_peer *peer;
1438         unsigned long now;
1439         bool send;
1440         int code;
1441
1442         switch (rt->dst.error) {
1443                 case EINVAL:
1444                 default:
1445                         goto out;
1446                 case EHOSTUNREACH:
1447                         code = ICMP_HOST_UNREACH;
1448                         break;
1449                 case ENETUNREACH:
1450                         code = ICMP_NET_UNREACH;
1451                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1452                                         IPSTATS_MIB_INNOROUTES);
1453                         break;
1454                 case EACCES:
1455                         code = ICMP_PKT_FILTERED;
1456                         break;
1457         }
1458
1459         if (!rt->peer)
1460                 rt_bind_peer(rt, rt->rt_dst, 1);
1461         peer = rt->peer;
1462
1463         send = true;
1464         if (peer) {
1465                 now = jiffies;
1466                 peer->rate_tokens += now - peer->rate_last;
1467                 if (peer->rate_tokens > ip_rt_error_burst)
1468                         peer->rate_tokens = ip_rt_error_burst;
1469                 peer->rate_last = now;
1470                 if (peer->rate_tokens >= ip_rt_error_cost)
1471                         peer->rate_tokens -= ip_rt_error_cost;
1472                 else
1473                         send = false;
1474         }
1475         if (send)
1476                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1477
1478 out:    kfree_skb(skb);
1479         return 0;
1480 }
1481
1482 /*
1483  *      The last two values are not from the RFC but
1484  *      are needed for AMPRnet AX.25 paths.
1485  */
1486
1487 static const unsigned short mtu_plateau[] =
1488 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1489
1490 static inline unsigned short guess_mtu(unsigned short old_mtu)
1491 {
1492         int i;
1493
1494         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1495                 if (old_mtu > mtu_plateau[i])
1496                         return mtu_plateau[i];
1497         return 68;
1498 }
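
/* Editor's note -- illustrative sketch, not part of route.c: how the
 * RFC 1191 plateau search above behaves for a few sample sizes, using the
 * mtu_plateau[] table defined above.
 */
#if 0
static void guess_mtu_example(void)
{
        unsigned short a = guess_mtu(1500);     /* next lower plateau: 1492 */
        unsigned short b = guess_mtu(576);      /* equal values are skipped: 296 */
        unsigned short c = guess_mtu(100);      /* below 128: 68-byte minimum */

        (void) a; (void) b; (void) c;
}
#endif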
1499
1500 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1501                                  unsigned short new_mtu,
1502                                  struct net_device *dev)
1503 {
1504         unsigned short old_mtu = ntohs(iph->tot_len);
1505         unsigned short est_mtu = 0;
1506         struct inet_peer *peer;
1507
1508         peer = inet_getpeer_v4(iph->daddr, 1);
1509         if (peer) {
1510                 unsigned short mtu = new_mtu;
1511
1512                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1513                         /* BSD 4.2 derived systems incorrectly adjust
1514                          * tot_len by the IP header length, and report
1515                          * a zero MTU in the ICMP message.
1516                          */
1517                         if (mtu == 0 &&
1518                             old_mtu >= 68 + (iph->ihl << 2))
1519                                 old_mtu -= iph->ihl << 2;
1520                         mtu = guess_mtu(old_mtu);
1521                 }
1522
1523                 if (mtu < ip_rt_min_pmtu)
1524                         mtu = ip_rt_min_pmtu;
1525                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1526                         unsigned long pmtu_expires;
1527
1528                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1529                         if (!pmtu_expires)
1530                                 pmtu_expires = 1UL;
1531
1532                         est_mtu = mtu;
1533                         peer->pmtu_learned = mtu;
1534                         peer->pmtu_expires = pmtu_expires;
1535                 }
1536
1537                 inet_putpeer(peer);
1538
1539                 atomic_inc(&__rt_peer_genid);
1540         }
1541         return est_mtu ? : new_mtu;
1542 }
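
/* Editor's note -- illustrative sketch, not part of route.c: the zero-MTU
 * path in ip_rt_frag_needed() above.  When a BSD 4.2 derived host reports
 * MTU 0, old_mtu (the tot_len echoed in the ICMP payload) has the IP header
 * length stripped and is then mapped to the next lower plateau.  The 1500
 * and 20 below are example values, not anything taken from the code.
 */
#if 0
static unsigned short broken_bsd_mtu_example(void)
{
        unsigned short old_mtu = 1500;  /* ntohs(iph->tot_len) */
        unsigned short ihl_bytes = 20;  /* iph->ihl << 2 */

        old_mtu -= ihl_bytes;           /* 1480 */
        return guess_mtu(old_mtu);      /* next lower plateau: 576 */
}
#endif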
1543
1544 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1545 {
1546         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1547
1548         if (!expires)
1549                 return;
1550         if (time_before(jiffies, expires)) {
1551                 u32 orig_dst_mtu = dst_mtu(dst);
1552                 if (peer->pmtu_learned < orig_dst_mtu) {
1553                         if (!peer->pmtu_orig)
1554                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1555                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1556                 }
1557         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1558                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1559 }
1560
1561 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1562 {
1563         struct rtable *rt = (struct rtable *) dst;
1564         struct inet_peer *peer;
1565
1566         dst_confirm(dst);
1567
1568         if (!rt->peer)
1569                 rt_bind_peer(rt, rt->rt_dst, 1);
1570         peer = rt->peer;
1571         if (peer) {
1572                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1573
1574                 if (mtu < ip_rt_min_pmtu)
1575                         mtu = ip_rt_min_pmtu;
1576                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1577
1578                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1579                         if (!pmtu_expires)
1580                                 pmtu_expires = 1UL;
1581
1582                         peer->pmtu_learned = mtu;
1583                         peer->pmtu_expires = pmtu_expires;
1584
1585                         atomic_inc(&__rt_peer_genid);
1586                         rt->rt_peer_genid = rt_peer_genid();
1587                 }
1588                 check_peer_pmtu(dst, peer);
1589         }
1590 }
1591
1592 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1593 {
1594         struct rtable *rt = (struct rtable *) dst;
1595         __be32 orig_gw = rt->rt_gateway;
1596
1597         dst_confirm(&rt->dst);
1598
1599         neigh_release(rt->dst.neighbour);
1600         rt->dst.neighbour = NULL;
1601
1602         rt->rt_gateway = peer->redirect_learned.a4;
1603         if (arp_bind_neighbour(&rt->dst) ||
1604             !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1605                 if (rt->dst.neighbour)
1606                         neigh_event_send(rt->dst.neighbour, NULL);
1607                 rt->rt_gateway = orig_gw;
1608                 return -EAGAIN;
1609         } else {
1610                 rt->rt_flags |= RTCF_REDIRECTED;
1611                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1612                                         rt->dst.neighbour);
1613         }
1614         return 0;
1615 }
1616
1617 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1618 {
1619         struct rtable *rt = (struct rtable *) dst;
1620
1621         if (rt_is_expired(rt))
1622                 return NULL;
1623         if (rt->rt_peer_genid != rt_peer_genid()) {
1624                 struct inet_peer *peer;
1625
1626                 if (!rt->peer)
1627                         rt_bind_peer(rt, rt->rt_dst, 0);
1628
1629                 peer = rt->peer;
1630                 if (peer) {
1631                         check_peer_pmtu(dst, peer);
1632
1633                         if (peer->redirect_learned.a4 &&
1634                             peer->redirect_learned.a4 != rt->rt_gateway) {
1635                                 if (check_peer_redir(dst, peer))
1636                                         return NULL;
1637                         }
1638                 }
1639
1640                 rt->rt_peer_genid = rt_peer_genid();
1641         }
1642         return dst;
1643 }
1644
1645 static void ipv4_dst_destroy(struct dst_entry *dst)
1646 {
1647         struct rtable *rt = (struct rtable *) dst;
1648         struct inet_peer *peer = rt->peer;
1649
1650         if (rt->fi) {
1651                 fib_info_put(rt->fi);
1652                 rt->fi = NULL;
1653         }
1654         if (peer) {
1655                 rt->peer = NULL;
1656                 inet_putpeer(peer);
1657         }
1658 }
1659
1660
1661 static void ipv4_link_failure(struct sk_buff *skb)
1662 {
1663         struct rtable *rt;
1664
1665         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1666
1667         rt = skb_rtable(skb);
1668         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1669                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1670 }
1671
1672 static int ip_rt_bug(struct sk_buff *skb)
1673 {
1674         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1675                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1676                 skb->dev ? skb->dev->name : "?");
1677         kfree_skb(skb);
1678         WARN_ON(1);
1679         return 0;
1680 }
1681
1682 /*
1683    We do not cache the source address of the outgoing interface,
1684    because it is used only by the IP RR, TS and SRR options,
1685    so it is out of the fast path.
1686
1687    BTW remember: "addr" is allowed to be unaligned
1688    in IP options!
1689  */
1690
1691 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1692 {
1693         __be32 src;
1694
1695         if (rt_is_output_route(rt))
1696                 src = ip_hdr(skb)->saddr;
1697         else {
1698                 struct fib_result res;
1699                 struct flowi4 fl4;
1700                 struct iphdr *iph;
1701
1702                 iph = ip_hdr(skb);
1703
1704                 memset(&fl4, 0, sizeof(fl4));
1705                 fl4.daddr = iph->daddr;
1706                 fl4.saddr = iph->saddr;
1707                 fl4.flowi4_tos = iph->tos;
1708                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1709                 fl4.flowi4_iif = skb->dev->ifindex;
1710                 fl4.flowi4_mark = skb->mark;
1711
1712                 rcu_read_lock();
1713                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1714                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1715                 else
1716                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1717                                         RT_SCOPE_UNIVERSE);
1718                 rcu_read_unlock();
1719         }
1720         memcpy(addr, &src, 4);
1721 }
1722
1723 #ifdef CONFIG_IP_ROUTE_CLASSID
1724 static void set_class_tag(struct rtable *rt, u32 tag)
1725 {
1726         if (!(rt->dst.tclassid & 0xFFFF))
1727                 rt->dst.tclassid |= tag & 0xFFFF;
1728         if (!(rt->dst.tclassid & 0xFFFF0000))
1729                 rt->dst.tclassid |= tag & 0xFFFF0000;
1730 }
1731 #endif
1732
1733 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1734 {
1735         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1736
1737         if (advmss == 0) {
1738                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1739                                ip_rt_min_advmss);
1740                 if (advmss > 65535 - 40)
1741                         advmss = 65535 - 40;
1742         }
1743         return advmss;
1744 }
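
/* Editor's note -- illustrative sketch, not part of route.c: with no
 * RTAX_ADVMSS metric set, ipv4_default_advmss() above derives the advertised
 * MSS from the device MTU minus 40 bytes of IPv4 + TCP headers, clamped
 * between ip_rt_min_advmss and 65535 - 40; a 1500-byte Ethernet MTU yields
 * 1460.
 */
#if 0
static unsigned int advmss_for_mtu_example(unsigned int dev_mtu)
{
        unsigned int advmss = max_t(unsigned int, dev_mtu - 40,
                                    ip_rt_min_advmss);

        return min_t(unsigned int, advmss, 65535 - 40);
}
#endif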
1745
1746 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1747 {
1748         unsigned int mtu = dst->dev->mtu;
1749
1750         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1751                 const struct rtable *rt = (const struct rtable *) dst;
1752
1753                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1754                         mtu = 576;
1755         }
1756
1757         if (mtu > IP_MAX_MTU)
1758                 mtu = IP_MAX_MTU;
1759
1760         return mtu;
1761 }
1762
1763 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1764                             struct fib_info *fi)
1765 {
1766         struct inet_peer *peer;
1767         int create = 0;
1768
1769         /* If a peer entry exists for this destination, we must hook
1770          * it up in order to get at cached metrics.
1771          */
1772         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1773                 create = 1;
1774
1775         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1776         if (peer) {
1777                 rt->rt_peer_genid = rt_peer_genid();
1778                 if (inet_metrics_new(peer))
1779                         memcpy(peer->metrics, fi->fib_metrics,
1780                                sizeof(u32) * RTAX_MAX);
1781                 dst_init_metrics(&rt->dst, peer->metrics, false);
1782
1783                 check_peer_pmtu(&rt->dst, peer);
1784                 if (peer->redirect_learned.a4 &&
1785                     peer->redirect_learned.a4 != rt->rt_gateway) {
1786                         rt->rt_gateway = peer->redirect_learned.a4;
1787                         rt->rt_flags |= RTCF_REDIRECTED;
1788                 }
1789         } else {
1790                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1791                         rt->fi = fi;
1792                         atomic_inc(&fi->fib_clntref);
1793                 }
1794                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1795         }
1796 }
1797
1798 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1799                            const struct fib_result *res,
1800                            struct fib_info *fi, u16 type, u32 itag)
1801 {
1802         struct dst_entry *dst = &rt->dst;
1803
1804         if (fi) {
1805                 if (FIB_RES_GW(*res) &&
1806                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1807                         rt->rt_gateway = FIB_RES_GW(*res);
1808                 rt_init_metrics(rt, fl4, fi);
1809 #ifdef CONFIG_IP_ROUTE_CLASSID
1810                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1811 #endif
1812         }
1813
1814         if (dst_mtu(dst) > IP_MAX_MTU)
1815                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1816         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1817                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1818
1819 #ifdef CONFIG_IP_ROUTE_CLASSID
1820 #ifdef CONFIG_IP_MULTIPLE_TABLES
1821         set_class_tag(rt, fib_rules_tclass(res));
1822 #endif
1823         set_class_tag(rt, itag);
1824 #endif
1825 }
1826
1827 static struct rtable *rt_dst_alloc(struct net_device *dev,
1828                                    bool nopolicy, bool noxfrm)
1829 {
1830         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1831                          DST_HOST |
1832                          (nopolicy ? DST_NOPOLICY : 0) |
1833                          (noxfrm ? DST_NOXFRM : 0));
1834 }
1835
1836 /* called in rcu_read_lock() section */
1837 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1838                                 u8 tos, struct net_device *dev, int our)
1839 {
1840         unsigned int hash;
1841         struct rtable *rth;
1842         __be32 spec_dst;
1843         struct in_device *in_dev = __in_dev_get_rcu(dev);
1844         u32 itag = 0;
1845         int err;
1846
1847         /* Primary sanity checks. */
1848
1849         if (in_dev == NULL)
1850                 return -EINVAL;
1851
1852         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1853             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1854                 goto e_inval;
1855
1856         if (ipv4_is_zeronet(saddr)) {
1857                 if (!ipv4_is_local_multicast(daddr))
1858                         goto e_inval;
1859                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1860         } else {
1861                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1862                                           &itag);
1863                 if (err < 0)
1864                         goto e_err;
1865         }
1866         rth = rt_dst_alloc(init_net.loopback_dev,
1867                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1868         if (!rth)
1869                 goto e_nobufs;
1870
1871 #ifdef CONFIG_IP_ROUTE_CLASSID
1872         rth->dst.tclassid = itag;
1873 #endif
1874         rth->dst.output = ip_rt_bug;
1875
1876         rth->rt_key_dst = daddr;
1877         rth->rt_key_src = saddr;
1878         rth->rt_genid   = rt_genid(dev_net(dev));
1879         rth->rt_flags   = RTCF_MULTICAST;
1880         rth->rt_type    = RTN_MULTICAST;
1881         rth->rt_key_tos = tos;
1882         rth->rt_dst     = daddr;
1883         rth->rt_src     = saddr;
1884         rth->rt_route_iif = dev->ifindex;
1885         rth->rt_iif     = dev->ifindex;
1886         rth->rt_oif     = 0;
1887         rth->rt_mark    = skb->mark;
1888         rth->rt_gateway = daddr;
1889         rth->rt_spec_dst= spec_dst;
1890         rth->rt_peer_genid = 0;
1891         rth->peer = NULL;
1892         rth->fi = NULL;
1893         if (our) {
1894                 rth->dst.input= ip_local_deliver;
1895                 rth->rt_flags |= RTCF_LOCAL;
1896         }
1897
1898 #ifdef CONFIG_IP_MROUTE
1899         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1900                 rth->dst.input = ip_mr_input;
1901 #endif
1902         RT_CACHE_STAT_INC(in_slow_mc);
1903
1904         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1905         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1906         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1907
1908 e_nobufs:
1909         return -ENOBUFS;
1910 e_inval:
1911         return -EINVAL;
1912 e_err:
1913         return err;
1914 }
1915
1916
1917 static void ip_handle_martian_source(struct net_device *dev,
1918                                      struct in_device *in_dev,
1919                                      struct sk_buff *skb,
1920                                      __be32 daddr,
1921                                      __be32 saddr)
1922 {
1923         RT_CACHE_STAT_INC(in_martian_src);
1924 #ifdef CONFIG_IP_ROUTE_VERBOSE
1925         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1926                 /*
1927                  *      RFC1812 recommendation: if the source is martian,
1928                  *      the only hint we can give is the MAC header.
1929                  */
1930                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1931                         &daddr, &saddr, dev->name);
1932                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1933                         int i;
1934                         const unsigned char *p = skb_mac_header(skb);
1935                         printk(KERN_WARNING "ll header: ");
1936                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1937                                 printk("%02x", *p);
1938                                 if (i < (dev->hard_header_len - 1))
1939                                         printk(":");
1940                         }
1941                         printk("\n");
1942                 }
1943         }
1944 #endif
1945 }
1946
1947 /* called in rcu_read_lock() section */
1948 static int __mkroute_input(struct sk_buff *skb,
1949                            const struct fib_result *res,
1950                            struct in_device *in_dev,
1951                            __be32 daddr, __be32 saddr, u32 tos,
1952                            struct rtable **result)
1953 {
1954         struct rtable *rth;
1955         int err;
1956         struct in_device *out_dev;
1957         unsigned int flags = 0;
1958         __be32 spec_dst;
1959         u32 itag;
1960
1961         /* get a working reference to the output device */
1962         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1963         if (out_dev == NULL) {
1964                 if (net_ratelimit())
1965                         printk(KERN_CRIT "Bug in ip_route_input" \
1966                                "_slow(). Please, report\n");
1967                 return -EINVAL;
1968         }
1969
1970
1971         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1972                                   in_dev->dev, &spec_dst, &itag);
1973         if (err < 0) {
1974                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1975                                          saddr);
1976
1977                 goto cleanup;
1978         }
1979
1980         if (err)
1981                 flags |= RTCF_DIRECTSRC;
1982
1983         if (out_dev == in_dev && err &&
1984             (IN_DEV_SHARED_MEDIA(out_dev) ||
1985              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1986                 flags |= RTCF_DOREDIRECT;
1987
1988         if (skb->protocol != htons(ETH_P_IP)) {
1989                 /* Not IP (i.e. ARP). Do not create a route if it is
1990                  * invalid for proxy arp. DNAT routes are always valid.
1991                  *
1992                  * The proxy arp feature has been extended to allow ARP
1993                  * replies back out of the same interface, to support
1994                  * Private VLAN switch technologies. See arp.c.
1995                  */
1996                 if (out_dev == in_dev &&
1997                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1998                         err = -EINVAL;
1999                         goto cleanup;
2000                 }
2001         }
2002
2003         rth = rt_dst_alloc(out_dev->dev,
2004                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2005                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2006         if (!rth) {
2007                 err = -ENOBUFS;
2008                 goto cleanup;
2009         }
2010
2011         rth->rt_key_dst = daddr;
2012         rth->rt_key_src = saddr;
2013         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2014         rth->rt_flags = flags;
2015         rth->rt_type = res->type;
2016         rth->rt_key_tos = tos;
2017         rth->rt_dst     = daddr;
2018         rth->rt_src     = saddr;
2019         rth->rt_route_iif = in_dev->dev->ifindex;
2020         rth->rt_iif     = in_dev->dev->ifindex;
2021         rth->rt_oif     = 0;
2022         rth->rt_mark    = skb->mark;
2023         rth->rt_gateway = daddr;
2024         rth->rt_spec_dst= spec_dst;
2025         rth->rt_peer_genid = 0;
2026         rth->peer = NULL;
2027         rth->fi = NULL;
2028
2029         rth->dst.input = ip_forward;
2030         rth->dst.output = ip_output;
2031
2032         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2033
2034         *result = rth;
2035         err = 0;
2036  cleanup:
2037         return err;
2038 }
2039
2040 static int ip_mkroute_input(struct sk_buff *skb,
2041                             struct fib_result *res,
2042                             const struct flowi4 *fl4,
2043                             struct in_device *in_dev,
2044                             __be32 daddr, __be32 saddr, u32 tos)
2045 {
2046         struct rtable* rth = NULL;
2047         int err;
2048         unsigned hash;
2049
2050 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2051         if (res->fi && res->fi->fib_nhs > 1)
2052                 fib_select_multipath(res);
2053 #endif
2054
2055         /* create a routing cache entry */
2056         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2057         if (err)
2058                 return err;
2059
2060         /* put it into the cache */
2061         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2062                        rt_genid(dev_net(rth->dst.dev)));
2063         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2064         if (IS_ERR(rth))
2065                 return PTR_ERR(rth);
2066         return 0;
2067 }
2068
2069 /*
2070  *      NOTE. We drop all packets that have a local source
2071  *      address, because every properly looped-back packet
2072  *      must already have the correct destination attached by the output routine.
2073  *
2074  *      This approach solves two big problems:
2075  *      1. Non-simplex devices are handled properly.
2076  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2077  *      Called with rcu_read_lock().
2078  */
2079
2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081                                u8 tos, struct net_device *dev)
2082 {
2083         struct fib_result res;
2084         struct in_device *in_dev = __in_dev_get_rcu(dev);
2085         struct flowi4   fl4;
2086         unsigned        flags = 0;
2087         u32             itag = 0;
2088         struct rtable * rth;
2089         unsigned        hash;
2090         __be32          spec_dst;
2091         int             err = -EINVAL;
2092         struct net    * net = dev_net(dev);
2093
2094         /* IP on this device is disabled. */
2095
2096         if (!in_dev)
2097                 goto out;
2098
2099         /* Check for the weirdest martians, which cannot be detected
2100            by fib_lookup.
2101          */
2102
2103         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2104             ipv4_is_loopback(saddr))
2105                 goto martian_source;
2106
2107         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2108                 goto brd_input;
2109
2110         /* Accept zero addresses only for the limited broadcast destination;
2111          * it is not clear whether this should be fixed. Waiting for complaints :-)
2112          */
2113         if (ipv4_is_zeronet(saddr))
2114                 goto martian_source;
2115
2116         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2117                 goto martian_destination;
2118
2119         /*
2120          *      Now we are ready to route packet.
2121          */
2122         fl4.flowi4_oif = 0;
2123         fl4.flowi4_iif = dev->ifindex;
2124         fl4.flowi4_mark = skb->mark;
2125         fl4.flowi4_tos = tos;
2126         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2127         fl4.daddr = daddr;
2128         fl4.saddr = saddr;
2129         err = fib_lookup(net, &fl4, &res);
2130         if (err != 0) {
2131                 if (!IN_DEV_FORWARD(in_dev))
2132                         goto e_hostunreach;
2133                 goto no_route;
2134         }
2135
2136         RT_CACHE_STAT_INC(in_slow_tot);
2137
2138         if (res.type == RTN_BROADCAST)
2139                 goto brd_input;
2140
2141         if (res.type == RTN_LOCAL) {
2142                 err = fib_validate_source(skb, saddr, daddr, tos,
2143                                           net->loopback_dev->ifindex,
2144                                           dev, &spec_dst, &itag);
2145                 if (err < 0)
2146                         goto martian_source_keep_err;
2147                 if (err)
2148                         flags |= RTCF_DIRECTSRC;
2149                 spec_dst = daddr;
2150                 goto local_input;
2151         }
2152
2153         if (!IN_DEV_FORWARD(in_dev))
2154                 goto e_hostunreach;
2155         if (res.type != RTN_UNICAST)
2156                 goto martian_destination;
2157
2158         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2159 out:    return err;
2160
2161 brd_input:
2162         if (skb->protocol != htons(ETH_P_IP))
2163                 goto e_inval;
2164
2165         if (ipv4_is_zeronet(saddr))
2166                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2167         else {
2168                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2169                                           &itag);
2170                 if (err < 0)
2171                         goto martian_source_keep_err;
2172                 if (err)
2173                         flags |= RTCF_DIRECTSRC;
2174         }
2175         flags |= RTCF_BROADCAST;
2176         res.type = RTN_BROADCAST;
2177         RT_CACHE_STAT_INC(in_brd);
2178
2179 local_input:
2180         rth = rt_dst_alloc(net->loopback_dev,
2181                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2182         if (!rth)
2183                 goto e_nobufs;
2184
2185         rth->dst.input= ip_local_deliver;
2186         rth->dst.output= ip_rt_bug;
2187 #ifdef CONFIG_IP_ROUTE_CLASSID
2188         rth->dst.tclassid = itag;
2189 #endif
2190
2191         rth->rt_key_dst = daddr;
2192         rth->rt_key_src = saddr;
2193         rth->rt_genid = rt_genid(net);
2194         rth->rt_flags   = flags|RTCF_LOCAL;
2195         rth->rt_type    = res.type;
2196         rth->rt_key_tos = tos;
2197         rth->rt_dst     = daddr;
2198         rth->rt_src     = saddr;
2199 #ifdef CONFIG_IP_ROUTE_CLASSID
2200         rth->dst.tclassid = itag;
2201 #endif
2202         rth->rt_route_iif = dev->ifindex;
2203         rth->rt_iif     = dev->ifindex;
2204         rth->rt_oif     = 0;
2205         rth->rt_mark    = skb->mark;
2206         rth->rt_gateway = daddr;
2207         rth->rt_spec_dst= spec_dst;
2208         rth->rt_peer_genid = 0;
2209         rth->peer = NULL;
2210         rth->fi = NULL;
2211         if (res.type == RTN_UNREACHABLE) {
2212                 rth->dst.input= ip_error;
2213                 rth->dst.error= -err;
2214                 rth->rt_flags   &= ~RTCF_LOCAL;
2215         }
2216         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2217         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2218         err = 0;
2219         if (IS_ERR(rth))
2220                 err = PTR_ERR(rth);
2221         goto out;
2222
2223 no_route:
2224         RT_CACHE_STAT_INC(in_no_route);
2225         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2226         res.type = RTN_UNREACHABLE;
2227         if (err == -ESRCH)
2228                 err = -ENETUNREACH;
2229         goto local_input;
2230
2231         /*
2232          *      Do not cache martian addresses: they should be logged (RFC1812)
2233          */
2234 martian_destination:
2235         RT_CACHE_STAT_INC(in_martian_dst);
2236 #ifdef CONFIG_IP_ROUTE_VERBOSE
2237         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2238                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2239                         &daddr, &saddr, dev->name);
2240 #endif
2241
2242 e_hostunreach:
2243         err = -EHOSTUNREACH;
2244         goto out;
2245
2246 e_inval:
2247         err = -EINVAL;
2248         goto out;
2249
2250 e_nobufs:
2251         err = -ENOBUFS;
2252         goto out;
2253
2254 martian_source:
2255         err = -EINVAL;
2256 martian_source_keep_err:
2257         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2258         goto out;
2259 }
2260
2261 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2262                            u8 tos, struct net_device *dev, bool noref)
2263 {
2264         struct rtable * rth;
2265         unsigned        hash;
2266         int iif = dev->ifindex;
2267         struct net *net;
2268         int res;
2269
2270         net = dev_net(dev);
2271
2272         rcu_read_lock();
2273
2274         if (!rt_caching(net))
2275                 goto skip_cache;
2276
2277         tos &= IPTOS_RT_MASK;
2278         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2279
2280         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2281              rth = rcu_dereference(rth->dst.rt_next)) {
2282                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2283                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2284                      (rth->rt_iif ^ iif) |
2285                      rth->rt_oif |
2286                      (rth->rt_key_tos ^ tos)) == 0 &&
2287                     rth->rt_mark == skb->mark &&
2288                     net_eq(dev_net(rth->dst.dev), net) &&
2289                     !rt_is_expired(rth)) {
2290                         if (noref) {
2291                                 dst_use_noref(&rth->dst, jiffies);
2292                                 skb_dst_set_noref(skb, &rth->dst);
2293                         } else {
2294                                 dst_use(&rth->dst, jiffies);
2295                                 skb_dst_set(skb, &rth->dst);
2296                         }
2297                         RT_CACHE_STAT_INC(in_hit);
2298                         rcu_read_unlock();
2299                         return 0;
2300                 }
2301                 RT_CACHE_STAT_INC(in_hlist_search);
2302         }
2303
2304 skip_cache:
2305         /* Multicast recognition logic is moved from the route cache to here.
2306            The problem was that too many Ethernet cards have broken/missing
2307            hardware multicast filters :-( As a result, a host on a multicast
2308            network acquires a lot of useless route cache entries, e.g. for
2309            SDR messages from all over the world. Now we try to get rid of them.
2310            Really, provided the software IP multicast filter is organized
2311            reasonably (at least, hashed), it does not cause a slowdown
2312            compared with route cache reject entries.
2313            Note that multicast routers are not affected, because a
2314            route cache entry is created for them eventually.
2315          */
2316         if (ipv4_is_multicast(daddr)) {
2317                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2318
2319                 if (in_dev) {
2320                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2321                                                   ip_hdr(skb)->protocol);
2322                         if (our
2323 #ifdef CONFIG_IP_MROUTE
2324                                 ||
2325                             (!ipv4_is_local_multicast(daddr) &&
2326                              IN_DEV_MFORWARD(in_dev))
2327 #endif
2328                            ) {
2329                                 int res = ip_route_input_mc(skb, daddr, saddr,
2330                                                             tos, dev, our);
2331                                 rcu_read_unlock();
2332                                 return res;
2333                         }
2334                 }
2335                 rcu_read_unlock();
2336                 return -EINVAL;
2337         }
2338         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2339         rcu_read_unlock();
2340         return res;
2341 }
2342 EXPORT_SYMBOL(ip_route_input_common);
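
/* Editor's note -- illustrative sketch, not part of route.c: the cache
 * lookup in ip_route_input_common() above folds all key fields into one OR
 * of XORs so that a single branch decides most misses.  The open-coded
 * equivalent of that folded test would be the helper below; the remaining
 * checks (mark, namespace, expiry) are still done separately.
 */
#if 0
static bool rt_input_keys_match_example(const struct rtable *rth, __be32 daddr,
                                        __be32 saddr, int iif, u8 tos)
{
        return rth->rt_key_dst == daddr &&
               rth->rt_key_src == saddr &&
               rth->rt_iif == iif &&
               rth->rt_oif == 0 &&              /* input routes carry no oif */
               rth->rt_key_tos == tos;
}
#endif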
2343
2344 /* called with rcu_read_lock() */
2345 static struct rtable *__mkroute_output(const struct fib_result *res,
2346                                        const struct flowi4 *fl4,
2347                                        __be32 orig_daddr, __be32 orig_saddr,
2348                                        int orig_oif, struct net_device *dev_out,
2349                                        unsigned int flags)
2350 {
2351         struct fib_info *fi = res->fi;
2352         u32 tos = RT_FL_TOS(fl4);
2353         struct in_device *in_dev;
2354         u16 type = res->type;
2355         struct rtable *rth;
2356
2357         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2358                 return ERR_PTR(-EINVAL);
2359
2360         if (ipv4_is_lbcast(fl4->daddr))
2361                 type = RTN_BROADCAST;
2362         else if (ipv4_is_multicast(fl4->daddr))
2363                 type = RTN_MULTICAST;
2364         else if (ipv4_is_zeronet(fl4->daddr))
2365                 return ERR_PTR(-EINVAL);
2366
2367         if (dev_out->flags & IFF_LOOPBACK)
2368                 flags |= RTCF_LOCAL;
2369
2370         in_dev = __in_dev_get_rcu(dev_out);
2371         if (!in_dev)
2372                 return ERR_PTR(-EINVAL);
2373
2374         if (type == RTN_BROADCAST) {
2375                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2376                 fi = NULL;
2377         } else if (type == RTN_MULTICAST) {
2378                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2379                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2380                                      fl4->flowi4_proto))
2381                         flags &= ~RTCF_LOCAL;
2382                 /* If a multicast route does not exist, use the
2383                  * default one, but do not gateway in this case.
2384                  * Yes, it is a hack.
2385                  */
2386                 if (fi && res->prefixlen < 4)
2387                         fi = NULL;
2388         }
2389
2390         rth = rt_dst_alloc(dev_out,
2391                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2392                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2393         if (!rth)
2394                 return ERR_PTR(-ENOBUFS);
2395
2396         rth->dst.output = ip_output;
2397
2398         rth->rt_key_dst = orig_daddr;
2399         rth->rt_key_src = orig_saddr;
2400         rth->rt_genid = rt_genid(dev_net(dev_out));
2401         rth->rt_flags   = flags;
2402         rth->rt_type    = type;
2403         rth->rt_key_tos = tos;
2404         rth->rt_dst     = fl4->daddr;
2405         rth->rt_src     = fl4->saddr;
2406         rth->rt_route_iif = 0;
2407         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2408         rth->rt_oif     = orig_oif;
2409         rth->rt_mark    = fl4->flowi4_mark;
2410         rth->rt_gateway = fl4->daddr;
2411         rth->rt_spec_dst= fl4->saddr;
2412         rth->rt_peer_genid = 0;
2413         rth->peer = NULL;
2414         rth->fi = NULL;
2415
2416         RT_CACHE_STAT_INC(out_slow_tot);
2417
2418         if (flags & RTCF_LOCAL) {
2419                 rth->dst.input = ip_local_deliver;
2420                 rth->rt_spec_dst = fl4->daddr;
2421         }
2422         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2423                 rth->rt_spec_dst = fl4->saddr;
2424                 if (flags & RTCF_LOCAL &&
2425                     !(dev_out->flags & IFF_LOOPBACK)) {
2426                         rth->dst.output = ip_mc_output;
2427                         RT_CACHE_STAT_INC(out_slow_mc);
2428                 }
2429 #ifdef CONFIG_IP_MROUTE
2430                 if (type == RTN_MULTICAST) {
2431                         if (IN_DEV_MFORWARD(in_dev) &&
2432                             !ipv4_is_local_multicast(fl4->daddr)) {
2433                                 rth->dst.input = ip_mr_input;
2434                                 rth->dst.output = ip_mc_output;
2435                         }
2436                 }
2437 #endif
2438         }
2439
2440         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2441
2442         return rth;
2443 }
2444
2445 /*
2446  * Major route resolver routine.
2447  * called with rcu_read_lock();
2448  */
2449
2450 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2451 {
2452         struct net_device *dev_out = NULL;
2453         u32 tos = RT_FL_TOS(fl4);
2454         unsigned int flags = 0;
2455         struct fib_result res;
2456         struct rtable *rth;
2457         __be32 orig_daddr;
2458         __be32 orig_saddr;
2459         int orig_oif;
2460
2461         res.fi          = NULL;
2462 #ifdef CONFIG_IP_MULTIPLE_TABLES
2463         res.r           = NULL;
2464 #endif
2465
2466         orig_daddr = fl4->daddr;
2467         orig_saddr = fl4->saddr;
2468         orig_oif = fl4->flowi4_oif;
2469
2470         fl4->flowi4_iif = net->loopback_dev->ifindex;
2471         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2472         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2473                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2474
2475         rcu_read_lock();
2476         if (fl4->saddr) {
2477                 rth = ERR_PTR(-EINVAL);
2478                 if (ipv4_is_multicast(fl4->saddr) ||
2479                     ipv4_is_lbcast(fl4->saddr) ||
2480                     ipv4_is_zeronet(fl4->saddr))
2481                         goto out;
2482
2483                 /* I removed the check for oif == dev_out->oif here.
2484                    It was wrong for two reasons:
2485                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2486                       is assigned to multiple interfaces.
2487                    2. Moreover, we are allowed to send packets with the saddr
2488                       of another iface. --ANK
2489                  */
2490
2491                 if (fl4->flowi4_oif == 0 &&
2492                     (ipv4_is_multicast(fl4->daddr) ||
2493                      ipv4_is_lbcast(fl4->daddr))) {
2494                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2495                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2496                         if (dev_out == NULL)
2497                                 goto out;
2498
2499                         /* Special hack: the user can direct multicasts
2500                            and limited broadcasts via the necessary interface
2501                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2502                            This hack is not just for fun, it allows
2503                            vic, vat and friends to work.
2504                            They bind a socket to loopback, set the ttl to zero
2505                            and expect that it will work.
2506                            From the viewpoint of the routing cache they are broken,
2507                            because we are not allowed to build a multicast path
2508                            with a loopback source address (the routing cache
2509                            cannot know that the ttl is zero, so that the packet
2510                            will never leave this host and the route is valid).
2511                            Luckily, this hack is a good workaround.
2512                          */
2513
2514                         fl4->flowi4_oif = dev_out->ifindex;
2515                         goto make_route;
2516                 }
2517
2518                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2519                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2520                         if (!__ip_dev_find(net, fl4->saddr, false))
2521                                 goto out;
2522                 }
2523         }
2524
2525
2526         if (fl4->flowi4_oif) {
2527                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2528                 rth = ERR_PTR(-ENODEV);
2529                 if (dev_out == NULL)
2530                         goto out;
2531
2532                 /* RACE: Check return value of inet_select_addr instead. */
2533                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2534                         rth = ERR_PTR(-ENETUNREACH);
2535                         goto out;
2536                 }
2537                 if (ipv4_is_local_multicast(fl4->daddr) ||
2538                     ipv4_is_lbcast(fl4->daddr)) {
2539                         if (!fl4->saddr)
2540                                 fl4->saddr = inet_select_addr(dev_out, 0,
2541                                                               RT_SCOPE_LINK);
2542                         goto make_route;
2543                 }
2544                 if (fl4->saddr) {
2545                         if (ipv4_is_multicast(fl4->daddr))
2546                                 fl4->saddr = inet_select_addr(dev_out, 0,
2547                                                               fl4->flowi4_scope);
2548                         else if (!fl4->daddr)
2549                                 fl4->saddr = inet_select_addr(dev_out, 0,
2550                                                               RT_SCOPE_HOST);
2551                 }
2552         }
2553
2554         if (!fl4->daddr) {
2555                 fl4->daddr = fl4->saddr;
2556                 if (!fl4->daddr)
2557                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2558                 dev_out = net->loopback_dev;
2559                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2560                 res.type = RTN_LOCAL;
2561                 flags |= RTCF_LOCAL;
2562                 goto make_route;
2563         }
2564
2565         if (fib_lookup(net, fl4, &res)) {
2566                 res.fi = NULL;
2567                 if (fl4->flowi4_oif) {
2568                         /* Apparently, the routing tables are wrong. Assume
2569                            that the destination is on-link.
2570
2571                            WHY? DW.
2572                            Because we are allowed to send to an iface
2573                            even if it has NO routes and NO assigned
2574                            addresses. When oif is specified, the routing
2575                            tables are looked up with only one purpose:
2576                            to catch whether the destination is gatewayed rather
2577                            than direct. Moreover, if MSG_DONTROUTE is set,
2578                            we send the packet, ignoring both routing tables
2579                            and ifaddr state. --ANK
2580
2581
2582                            We could do this even when oif is unknown,
2583                            likely IPv6, but we do not.
2584                          */
2585
2586                         if (fl4->saddr == 0)
2587                                 fl4->saddr = inet_select_addr(dev_out, 0,
2588                                                               RT_SCOPE_LINK);
2589                         res.type = RTN_UNICAST;
2590                         goto make_route;
2591                 }
2592                 rth = ERR_PTR(-ENETUNREACH);
2593                 goto out;
2594         }
2595
2596         if (res.type == RTN_LOCAL) {
2597                 if (!fl4->saddr) {
2598                         if (res.fi->fib_prefsrc)
2599                                 fl4->saddr = res.fi->fib_prefsrc;
2600                         else
2601                                 fl4->saddr = fl4->daddr;
2602                 }
2603                 dev_out = net->loopback_dev;
2604                 fl4->flowi4_oif = dev_out->ifindex;
2605                 res.fi = NULL;
2606                 flags |= RTCF_LOCAL;
2607                 goto make_route;
2608         }
2609
2610 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2611         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2612                 fib_select_multipath(&res);
2613         else
2614 #endif
2615         if (!res.prefixlen &&
2616             res.table->tb_num_default > 1 &&
2617             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2618                 fib_select_default(&res);
2619
2620         if (!fl4->saddr)
2621                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2622
2623         dev_out = FIB_RES_DEV(res);
2624         fl4->flowi4_oif = dev_out->ifindex;
2625
2626
2627 make_route:
2628         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2629                                dev_out, flags);
2630         if (!IS_ERR(rth)) {
2631                 unsigned int hash;
2632
2633                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2634                                rt_genid(dev_net(dev_out)));
2635                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2636         }
2637
2638 out:
2639         rcu_read_unlock();
2640         return rth;
2641 }
2642
2643 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2644 {
2645         struct rtable *rth;
2646         unsigned int hash;
2647
2648         if (!rt_caching(net))
2649                 goto slow_output;
2650
2651         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2652
2653         rcu_read_lock_bh();
2654         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2655                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2656                 if (rth->rt_key_dst == flp4->daddr &&
2657                     rth->rt_key_src == flp4->saddr &&
2658                     rt_is_output_route(rth) &&
2659                     rth->rt_oif == flp4->flowi4_oif &&
2660                     rth->rt_mark == flp4->flowi4_mark &&
2661                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2662                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2663                     net_eq(dev_net(rth->dst.dev), net) &&
2664                     !rt_is_expired(rth)) {
2665                         dst_use(&rth->dst, jiffies);
2666                         RT_CACHE_STAT_INC(out_hit);
2667                         rcu_read_unlock_bh();
2668                         if (!flp4->saddr)
2669                                 flp4->saddr = rth->rt_src;
2670                         if (!flp4->daddr)
2671                                 flp4->daddr = rth->rt_dst;
2672                         return rth;
2673                 }
2674                 RT_CACHE_STAT_INC(out_hlist_search);
2675         }
2676         rcu_read_unlock_bh();
2677
2678 slow_output:
2679         return ip_route_output_slow(net, flp4);
2680 }
2681 EXPORT_SYMBOL_GPL(__ip_route_output_key);
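
/* Editor's note -- illustrative sketch, not part of route.c: a typical
 * caller goes through the ip_route_output_key() wrapper, filling only the
 * flowi4 fields it cares about, much as inet_rtm_getroute() does further
 * below.  The daddr/saddr/oif parameters here are placeholders supplied by
 * the hypothetical caller.
 */
#if 0
static struct rtable *output_route_example(struct net *net, __be32 daddr,
                                           __be32 saddr, int oif)
{
        struct flowi4 fl4 = {
                .daddr = daddr,
                .saddr = saddr,
                .flowi4_oif = oif,
        };

        return ip_route_output_key(net, &fl4); /* ERR_PTR() on failure */
}
#endif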
2682
2683 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2684 {
2685         return NULL;
2686 }
2687
2688 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2689 {
2690         return 0;
2691 }
2692
2693 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2694 {
2695 }
2696
2697 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2698                                           unsigned long old)
2699 {
2700         return NULL;
2701 }
2702
2703 static struct dst_ops ipv4_dst_blackhole_ops = {
2704         .family                 =       AF_INET,
2705         .protocol               =       cpu_to_be16(ETH_P_IP),
2706         .destroy                =       ipv4_dst_destroy,
2707         .check                  =       ipv4_blackhole_dst_check,
2708         .default_mtu            =       ipv4_blackhole_default_mtu,
2709         .default_advmss         =       ipv4_default_advmss,
2710         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2711         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2712 };
2713
2714 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2715 {
2716         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2717         struct rtable *ort = (struct rtable *) dst_orig;
2718
2719         if (rt) {
2720                 struct dst_entry *new = &rt->dst;
2721
2722                 new->__use = 1;
2723                 new->input = dst_discard;
2724                 new->output = dst_discard;
2725                 dst_copy_metrics(new, &ort->dst);
2726
2727                 new->dev = ort->dst.dev;
2728                 if (new->dev)
2729                         dev_hold(new->dev);
2730
2731                 rt->rt_key_dst = ort->rt_key_dst;
2732                 rt->rt_key_src = ort->rt_key_src;
2733                 rt->rt_key_tos = ort->rt_key_tos;
2734                 rt->rt_route_iif = ort->rt_route_iif;
2735                 rt->rt_iif = ort->rt_iif;
2736                 rt->rt_oif = ort->rt_oif;
2737                 rt->rt_mark = ort->rt_mark;
2738
2739                 rt->rt_genid = rt_genid(net);
2740                 rt->rt_flags = ort->rt_flags;
2741                 rt->rt_type = ort->rt_type;
2742                 rt->rt_dst = ort->rt_dst;
2743                 rt->rt_src = ort->rt_src;
2744                 rt->rt_gateway = ort->rt_gateway;
2745                 rt->rt_spec_dst = ort->rt_spec_dst;
2746                 rt->peer = ort->peer;
2747                 if (rt->peer)
2748                         atomic_inc(&rt->peer->refcnt);
2749                 rt->fi = ort->fi;
2750                 if (rt->fi)
2751                         atomic_inc(&rt->fi->fib_clntref);
2752
2753                 dst_free(new);
2754         }
2755
2756         dst_release(dst_orig);
2757
2758         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2759 }
2760
2761 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2762                                     struct sock *sk)
2763 {
2764         struct rtable *rt = __ip_route_output_key(net, flp4);
2765
2766         if (IS_ERR(rt))
2767                 return rt;
2768
2769         if (flp4->flowi4_proto)
2770                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2771                                                    flowi4_to_flowi(flp4),
2772                                                    sk, 0);
2773
2774         return rt;
2775 }
2776 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2777
2778 static int rt_fill_info(struct net *net,
2779                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2780                         int nowait, unsigned int flags)
2781 {
2782         struct rtable *rt = skb_rtable(skb);
2783         struct rtmsg *r;
2784         struct nlmsghdr *nlh;
2785         long expires = 0;
2786         const struct inet_peer *peer = rt->peer;
2787         u32 id = 0, ts = 0, tsage = 0, error;
2788
2789         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2790         if (nlh == NULL)
2791                 return -EMSGSIZE;
2792
2793         r = nlmsg_data(nlh);
2794         r->rtm_family    = AF_INET;
2795         r->rtm_dst_len  = 32;
2796         r->rtm_src_len  = 0;
2797         r->rtm_tos      = rt->rt_key_tos;
2798         r->rtm_table    = RT_TABLE_MAIN;
2799         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2800         r->rtm_type     = rt->rt_type;
2801         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2802         r->rtm_protocol = RTPROT_UNSPEC;
2803         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2804         if (rt->rt_flags & RTCF_NOTIFY)
2805                 r->rtm_flags |= RTM_F_NOTIFY;
2806
2807         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2808
2809         if (rt->rt_key_src) {
2810                 r->rtm_src_len = 32;
2811                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2812         }
2813         if (rt->dst.dev)
2814                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2815 #ifdef CONFIG_IP_ROUTE_CLASSID
2816         if (rt->dst.tclassid)
2817                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2818 #endif
2819         if (rt_is_input_route(rt))
2820                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2821         else if (rt->rt_src != rt->rt_key_src)
2822                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2823
2824         if (rt->rt_dst != rt->rt_gateway)
2825                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2826
2827         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2828                 goto nla_put_failure;
2829
2830         if (rt->rt_mark)
2831                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2832
2833         error = rt->dst.error;
2834         if (peer) {
2835                 inet_peer_refcheck(rt->peer);
2836                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2837                 if (peer->tcp_ts_stamp) {
2838                         ts = peer->tcp_ts;
2839                         tsage = get_seconds() - peer->tcp_ts_stamp;
2840                 }
2841                 expires = ACCESS_ONCE(peer->pmtu_expires);
2842                 if (expires)
2843                         expires -= jiffies;
2844         }
2845
2846         if (rt_is_input_route(rt)) {
2847 #ifdef CONFIG_IP_MROUTE
2848                 __be32 dst = rt->rt_dst;
2849
2850                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2851                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2852                         int err = ipmr_get_route(net, skb,
2853                                                  rt->rt_src, rt->rt_dst,
2854                                                  r, nowait);
2855                         if (err <= 0) {
2856                                 if (!nowait) {
2857                                         if (err == 0)
2858                                                 return 0;
2859                                         goto nla_put_failure;
2860                                 } else {
2861                                         if (err == -EMSGSIZE)
2862                                                 goto nla_put_failure;
2863                                         error = err;
2864                                 }
2865                         }
2866                 } else
2867 #endif
2868                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2869         }
2870
2871         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2872                                expires, error) < 0)
2873                 goto nla_put_failure;
2874
2875         return nlmsg_end(skb, nlh);
2876
2877 nla_put_failure:
2878         nlmsg_cancel(skb, nlh);
2879         return -EMSGSIZE;
2880 }
2881
2882 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2883 {
2884         struct net *net = sock_net(in_skb->sk);
2885         struct rtmsg *rtm;
2886         struct nlattr *tb[RTA_MAX+1];
2887         struct rtable *rt = NULL;
2888         __be32 dst = 0;
2889         __be32 src = 0;
2890         u32 iif;
2891         int err;
2892         int mark;
2893         struct sk_buff *skb;
2894
2895         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2896         if (err < 0)
2897                 goto errout;
2898
2899         rtm = nlmsg_data(nlh);
2900
2901         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2902         if (skb == NULL) {
2903                 err = -ENOBUFS;
2904                 goto errout;
2905         }
2906
2907         /* Reserve room for dummy headers; this skb can pass
2908            through a good chunk of the routing engine.
2909          */
2910         skb_reset_mac_header(skb);
2911         skb_reset_network_header(skb);
2912
2913         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2914         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2915         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2916
2917         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2918         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2919         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2920         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2921
2922         if (iif) {
2923                 struct net_device *dev;
2924
2925                 dev = __dev_get_by_index(net, iif);
2926                 if (dev == NULL) {
2927                         err = -ENODEV;
2928                         goto errout_free;
2929                 }
2930
2931                 skb->protocol   = htons(ETH_P_IP);
2932                 skb->dev        = dev;
2933                 skb->mark       = mark;
2934                 local_bh_disable();
2935                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2936                 local_bh_enable();
2937
2938                 rt = skb_rtable(skb);
2939                 if (err == 0 && rt->dst.error)
2940                         err = -rt->dst.error;
2941         } else {
2942                 struct flowi4 fl4 = {
2943                         .daddr = dst,
2944                         .saddr = src,
2945                         .flowi4_tos = rtm->rtm_tos,
2946                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2947                         .flowi4_mark = mark,
2948                 };
2949                 rt = ip_route_output_key(net, &fl4);
2950
2951                 err = 0;
2952                 if (IS_ERR(rt))
2953                         err = PTR_ERR(rt);
2954         }
2955
2956         if (err)
2957                 goto errout_free;
2958
2959         skb_dst_set(skb, &rt->dst);
2960         if (rtm->rtm_flags & RTM_F_NOTIFY)
2961                 rt->rt_flags |= RTCF_NOTIFY;
2962
2963         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2964                            RTM_NEWROUTE, 0, 0);
2965         if (err <= 0)
2966                 goto errout_free;
2967
2968         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2969 errout:
2970         return err;
2971
2972 errout_free:
2973         kfree_skb(skb);
2974         goto errout;
2975 }
2976
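/*
 * ip_rt_dump() below is the dump side of RTM_GETROUTE: it walks every
 * chain of rt_hash_table under rcu_read_lock_bh(), skips entries that
 * belong to other namespaces or have expired, and stores the current
 * (bucket, index) position in cb->args[] so an interrupted dump can
 * resume where it left off.
 */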
2977 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2978 {
2979         struct rtable *rt;
2980         int h, s_h;
2981         int idx, s_idx;
2982         struct net *net;
2983
2984         net = sock_net(skb->sk);
2985
2986         s_h = cb->args[0];
2987         if (s_h < 0)
2988                 s_h = 0;
2989         s_idx = idx = cb->args[1];
2990         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2991                 if (!rt_hash_table[h].chain)
2992                         continue;
2993                 rcu_read_lock_bh();
2994                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2995                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2996                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2997                                 continue;
2998                         if (rt_is_expired(rt))
2999                                 continue;
3000                         skb_dst_set_noref(skb, &rt->dst);
3001                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3002                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3003                                          1, NLM_F_MULTI) <= 0) {
3004                                 skb_dst_drop(skb);
3005                                 rcu_read_unlock_bh();
3006                                 goto done;
3007                         }
3008                         skb_dst_drop(skb);
3009                 }
3010                 rcu_read_unlock_bh();
3011         }
3012
3013 done:
3014         cb->args[0] = h;
3015         cb->args[1] = idx;
3016         return skb->len;
3017 }
3018
3019 void ip_rt_multicast_event(struct in_device *in_dev)
3020 {
3021         rt_cache_flush(dev_net(in_dev->dev), 0);
3022 }
3023
3024 #ifdef CONFIG_SYSCTL
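/*
 * ipv4_sysctl_rtcache_flush() below backs the write-only (mode 0200)
 * "flush" sysctl registered further down: writing a delay value flushes
 * the route cache of the namespace passed in via ->extra1, while reads
 * fail with -EINVAL.
 */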
3025 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3026                                         void __user *buffer,
3027                                         size_t *lenp, loff_t *ppos)
3028 {
3029         if (write) {
3030                 int flush_delay;
3031                 ctl_table ctl;
3032                 struct net *net;
3033
3034                 memcpy(&ctl, __ctl, sizeof(ctl));
3035                 ctl.data = &flush_delay;
3036                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3037
3038                 net = (struct net *)__ctl->extra1;
3039                 rt_cache_flush(net, flush_delay);
3040                 return 0;
3041         }
3042
3043         return -EINVAL;
3044 }
3045
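/*
 * ipv4_route_table below holds the tunables exported under
 * /proc/sys/net/ipv4/route/: garbage-collection thresholds and timing,
 * ICMP redirect rate limiting, error cost/burst and the PMTU bounds.
 * All entries are plain ints; the time-valued ones are converted through
 * the proc_dointvec_jiffies / proc_dointvec_ms_jiffies handlers.
 */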
3046 static ctl_table ipv4_route_table[] = {
3047         {
3048                 .procname       = "gc_thresh",
3049                 .data           = &ipv4_dst_ops.gc_thresh,
3050                 .maxlen         = sizeof(int),
3051                 .mode           = 0644,
3052                 .proc_handler   = proc_dointvec,
3053         },
3054         {
3055                 .procname       = "max_size",
3056                 .data           = &ip_rt_max_size,
3057                 .maxlen         = sizeof(int),
3058                 .mode           = 0644,
3059                 .proc_handler   = proc_dointvec,
3060         },
3061         {
3062                 /*  Deprecated. Use gc_min_interval_ms */
3063
3064                 .procname       = "gc_min_interval",
3065                 .data           = &ip_rt_gc_min_interval,
3066                 .maxlen         = sizeof(int),
3067                 .mode           = 0644,
3068                 .proc_handler   = proc_dointvec_jiffies,
3069         },
3070         {
3071                 .procname       = "gc_min_interval_ms",
3072                 .data           = &ip_rt_gc_min_interval,
3073                 .maxlen         = sizeof(int),
3074                 .mode           = 0644,
3075                 .proc_handler   = proc_dointvec_ms_jiffies,
3076         },
3077         {
3078                 .procname       = "gc_timeout",
3079                 .data           = &ip_rt_gc_timeout,
3080                 .maxlen         = sizeof(int),
3081                 .mode           = 0644,
3082                 .proc_handler   = proc_dointvec_jiffies,
3083         },
3084         {
3085                 .procname       = "gc_interval",
3086                 .data           = &ip_rt_gc_interval,
3087                 .maxlen         = sizeof(int),
3088                 .mode           = 0644,
3089                 .proc_handler   = proc_dointvec_jiffies,
3090         },
3091         {
3092                 .procname       = "redirect_load",
3093                 .data           = &ip_rt_redirect_load,
3094                 .maxlen         = sizeof(int),
3095                 .mode           = 0644,
3096                 .proc_handler   = proc_dointvec,
3097         },
3098         {
3099                 .procname       = "redirect_number",
3100                 .data           = &ip_rt_redirect_number,
3101                 .maxlen         = sizeof(int),
3102                 .mode           = 0644,
3103                 .proc_handler   = proc_dointvec,
3104         },
3105         {
3106                 .procname       = "redirect_silence",
3107                 .data           = &ip_rt_redirect_silence,
3108                 .maxlen         = sizeof(int),
3109                 .mode           = 0644,
3110                 .proc_handler   = proc_dointvec,
3111         },
3112         {
3113                 .procname       = "error_cost",
3114                 .data           = &ip_rt_error_cost,
3115                 .maxlen         = sizeof(int),
3116                 .mode           = 0644,
3117                 .proc_handler   = proc_dointvec,
3118         },
3119         {
3120                 .procname       = "error_burst",
3121                 .data           = &ip_rt_error_burst,
3122                 .maxlen         = sizeof(int),
3123                 .mode           = 0644,
3124                 .proc_handler   = proc_dointvec,
3125         },
3126         {
3127                 .procname       = "gc_elasticity",
3128                 .data           = &ip_rt_gc_elasticity,
3129                 .maxlen         = sizeof(int),
3130                 .mode           = 0644,
3131                 .proc_handler   = proc_dointvec,
3132         },
3133         {
3134                 .procname       = "mtu_expires",
3135                 .data           = &ip_rt_mtu_expires,
3136                 .maxlen         = sizeof(int),
3137                 .mode           = 0644,
3138                 .proc_handler   = proc_dointvec_jiffies,
3139         },
3140         {
3141                 .procname       = "min_pmtu",
3142                 .data           = &ip_rt_min_pmtu,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec,
3146         },
3147         {
3148                 .procname       = "min_adv_mss",
3149                 .data           = &ip_rt_min_advmss,
3150                 .maxlen         = sizeof(int),
3151                 .mode           = 0644,
3152                 .proc_handler   = proc_dointvec,
3153         },
3154         { }
3155 };
3156
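/*
 * The skeleton below is what ip_static_sysctl_init() (at the bottom of
 * this file) registers under net.ipv4: "route" points at the table above
 * and "neigh" at an empty placeholder, apparently so that both
 * directories exist early despite the ipv4 init-order problem noted near
 * the end of the file.
 */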
3157 static struct ctl_table empty[1];
3158
3159 static struct ctl_table ipv4_skeleton[] =
3160 {
3161         { .procname = "route", 
3162           .mode = 0555, .child = ipv4_route_table},
3163         { .procname = "neigh", 
3164           .mode = 0555, .child = empty},
3165         { }
3166 };
3167
3168 static __net_initdata struct ctl_path ipv4_path[] = {
3169         { .procname = "net", },
3170         { .procname = "ipv4", },
3171         { },
3172 };
3173
3174 static struct ctl_table ipv4_route_flush_table[] = {
3175         {
3176                 .procname       = "flush",
3177                 .maxlen         = sizeof(int),
3178                 .mode           = 0200,
3179                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3180         },
3181         { },
3182 };
3183
3184 static __net_initdata struct ctl_path ipv4_route_path[] = {
3185         { .procname = "net", },
3186         { .procname = "ipv4", },
3187         { .procname = "route", },
3188         { },
3189 };
3190
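/*
 * sysctl_route_net_init()/_exit() below register the "flush" entry per
 * network namespace.  Namespaces other than init_net get a kmemdup()'d
 * copy of the table so that tbl[0].extra1 can point back at the owning
 * struct net for the handler above.
 */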
3191 static __net_init int sysctl_route_net_init(struct net *net)
3192 {
3193         struct ctl_table *tbl;
3194
3195         tbl = ipv4_route_flush_table;
3196         if (!net_eq(net, &init_net)) {
3197                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3198                 if (tbl == NULL)
3199                         goto err_dup;
3200         }
3201         tbl[0].extra1 = net;
3202
3203         net->ipv4.route_hdr =
3204                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3205         if (net->ipv4.route_hdr == NULL)
3206                 goto err_reg;
3207         return 0;
3208
3209 err_reg:
3210         if (tbl != ipv4_route_flush_table)
3211                 kfree(tbl);
3212 err_dup:
3213         return -ENOMEM;
3214 }
3215
3216 static __net_exit void sysctl_route_net_exit(struct net *net)
3217 {
3218         struct ctl_table *tbl;
3219
3220         tbl = net->ipv4.route_hdr->ctl_table_arg;
3221         unregister_net_sysctl_table(net->ipv4.route_hdr);
3222         BUG_ON(tbl == ipv4_route_flush_table);
3223         kfree(tbl);
3224 }
3225
3226 static __net_initdata struct pernet_operations sysctl_route_ops = {
3227         .init = sysctl_route_net_init,
3228         .exit = sysctl_route_net_exit,
3229 };
3230 #endif
3231
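/*
 * rt_genid_init() below runs once per network namespace and seeds the
 * route-cache and device-address generation counters (rt_genid,
 * dev_addr_genid) with random values.
 */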
3232 static __net_init int rt_genid_init(struct net *net)
3233 {
3234         get_random_bytes(&net->ipv4.rt_genid,
3235                          sizeof(net->ipv4.rt_genid));
3236         get_random_bytes(&net->ipv4.dev_addr_genid,
3237                          sizeof(net->ipv4.dev_addr_genid));
3238         return 0;
3239 }
3240
3241 static __net_initdata struct pernet_operations rt_genid_ops = {
3242         .init = rt_genid_init,
3243 };
3244
3245
3246 #ifdef CONFIG_IP_ROUTE_CLASSID
3247 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3248 #endif /* CONFIG_IP_ROUTE_CLASSID */
3249
3250 static __initdata unsigned long rhash_entries;
3251 static int __init set_rhash_entries(char *str)
3252 {
3253         if (!str)
3254                 return 0;
3255         rhash_entries = simple_strtoul(str, &str, 0);
3256         return 1;
3257 }
3258 __setup("rhash_entries=", set_rhash_entries);
3259
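/*
 * ip_rt_init() below is the boot-time setup: it allocates the dst slab
 * cache and the route-cache hash table (sized from totalram_pages unless
 * overridden with the rhash_entries= boot parameter), scales gc_thresh
 * and ip_rt_max_size to the table size, and registers the proc files,
 * the RTM_GETROUTE handler and the per-net sysctl/genid operations.
 */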
3260 int __init ip_rt_init(void)
3261 {
3262         int rc = 0;
3263
3264 #ifdef CONFIG_IP_ROUTE_CLASSID
3265         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3266         if (!ip_rt_acct)
3267                 panic("IP: failed to allocate ip_rt_acct\n");
3268 #endif
3269
3270         ipv4_dst_ops.kmem_cachep =
3271                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3272                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3273
3274         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3275
3276         if (dst_entries_init(&ipv4_dst_ops) < 0)
3277                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3278
3279         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3280                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3281
3282         rt_hash_table = (struct rt_hash_bucket *)
3283                 alloc_large_system_hash("IP route cache",
3284                                         sizeof(struct rt_hash_bucket),
3285                                         rhash_entries,
3286                                         (totalram_pages >= 128 * 1024) ?
3287                                         15 : 17,
3288                                         0,
3289                                         &rt_hash_log,
3290                                         &rt_hash_mask,
3291                                         rhash_entries ? 0 : 512 * 1024);
3292         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3293         rt_hash_lock_init();
3294
3295         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3296         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3297
3298         devinet_init();
3299         ip_fib_init();
3300
3301         if (ip_rt_proc_init())
3302                 printk(KERN_ERR "Unable to create route proc files\n");
3303 #ifdef CONFIG_XFRM
3304         xfrm_init();
3305         xfrm4_init(ip_rt_max_size);
3306 #endif
3307         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3308
3309 #ifdef CONFIG_SYSCTL
3310         register_pernet_subsys(&sysctl_route_ops);
3311 #endif
3312         register_pernet_subsys(&rt_genid_ops);
3313         return rc;
3314 }
3315
3316 #ifdef CONFIG_SYSCTL
3317 /*
3318  * We really need to sanitize the damn ipv4 init order, then all
3319  * this nonsense will go away.
3320  */
3321 void __init ip_static_sysctl_init(void)
3322 {
3323         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3324 }
3325 #endif