1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116
117 #define IP_MAX_MTU      0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly  = 9;
126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
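/* Illustrative note: the silence value below works out to
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1), i.e. (HZ / 50) << 10
 * jiffies, roughly 20 seconds. */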
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly       = HZ;
129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly    = 8;
131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly       = 256;
134 static int rt_chain_length_max __read_mostly    = 20;
135 static int redirect_genid;
136
137 static struct delayed_work expires_work;
138 static unsigned long expires_ljiffies;
139
140 /*
141  *      Interface to generic destination cache.
142  */
143
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
147 static void              ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void              ipv4_link_failure(struct sk_buff *skb);
150 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
152
153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
154                             int how)
155 {
156 }
157
158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 {
160         struct rtable *rt = (struct rtable *) dst;
161         struct inet_peer *peer;
162         u32 *p = NULL;
163
164         if (!rt->peer)
165                 rt_bind_peer(rt, rt->rt_dst, 1);
166
167         peer = rt->peer;
168         if (peer) {
169                 u32 *old_p = __DST_METRICS_PTR(old);
170                 unsigned long prev, new;
171
172                 p = peer->metrics;
173                 if (inet_metrics_new(peer))
174                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175
176                 new = (unsigned long) p;
177                 prev = cmpxchg(&dst->_metrics, old, new);
178
179                 if (prev != old) {
180                         p = __DST_METRICS_PTR(prev);
181                         if (prev & DST_METRICS_READ_ONLY)
182                                 p = NULL;
183                 } else {
184                         if (rt->fi) {
185                                 fib_info_put(rt->fi);
186                                 rt->fi = NULL;
187                         }
188                 }
189         }
190         return p;
191 }
192
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194
195 static struct dst_ops ipv4_dst_ops = {
196         .family =               AF_INET,
197         .protocol =             cpu_to_be16(ETH_P_IP),
198         .gc =                   rt_garbage_collect,
199         .check =                ipv4_dst_check,
200         .default_advmss =       ipv4_default_advmss,
201         .mtu =                  ipv4_mtu,
202         .cow_metrics =          ipv4_cow_metrics,
203         .destroy =              ipv4_dst_destroy,
204         .ifdown =               ipv4_dst_ifdown,
205         .negative_advice =      ipv4_negative_advice,
206         .link_failure =         ipv4_link_failure,
207         .update_pmtu =          ip_rt_update_pmtu,
208         .local_out =            __ip_local_out,
209         .neigh_lookup =         ipv4_neigh_lookup,
210 };
211
212 #define ECN_OR_COST(class)      TC_PRIO_##class
213
214 const __u8 ip_tos2prio[16] = {
215         TC_PRIO_BESTEFFORT,
216         ECN_OR_COST(BESTEFFORT),
217         TC_PRIO_BESTEFFORT,
218         ECN_OR_COST(BESTEFFORT),
219         TC_PRIO_BULK,
220         ECN_OR_COST(BULK),
221         TC_PRIO_BULK,
222         ECN_OR_COST(BULK),
223         TC_PRIO_INTERACTIVE,
224         ECN_OR_COST(INTERACTIVE),
225         TC_PRIO_INTERACTIVE,
226         ECN_OR_COST(INTERACTIVE),
227         TC_PRIO_INTERACTIVE_BULK,
228         ECN_OR_COST(INTERACTIVE_BULK),
229         TC_PRIO_INTERACTIVE_BULK,
230         ECN_OR_COST(INTERACTIVE_BULK)
231 };
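/* Illustrative example of how this table is consumed: rt_tos2priority(tos),
 * defined in include/net/route.h, indexes ip_tos2prio with IPTOS_TOS(tos) >> 1.
 * So a TOS of 0x10 (IPTOS_LOWDELAY) maps to index 8, TC_PRIO_INTERACTIVE,
 * while 0x08 (IPTOS_THROUGHPUT) maps to index 4, TC_PRIO_BULK. */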
232
233
234 /*
235  * Route cache.
236  */
237
238 /* The locking scheme is rather straightforward:
239  *
240  * 1) Read-Copy Update protects the buckets of the central route hash.
241  * 2) Only writers remove entries, and they hold the lock
242  *    as they look at rtable reference counts.
243  * 3) Only readers acquire references to rtable entries,
244  *    they do so with atomic increments and with the
245  *    lock held.
246  */
247
248 struct rt_hash_bucket {
249         struct rtable __rcu     *chain;
250 };
251
252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
253         defined(CONFIG_PROVE_LOCKING)
254 /*
255  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
256  * The size of this table is a power of two and depends on the number of CPUs.
257  * (With lockdep we have quite a big spinlock_t, so keep the size down there.)
258  */
259 #ifdef CONFIG_LOCKDEP
260 # define RT_HASH_LOCK_SZ        256
261 #else
262 # if NR_CPUS >= 32
263 #  define RT_HASH_LOCK_SZ       4096
264 # elif NR_CPUS >= 16
265 #  define RT_HASH_LOCK_SZ       2048
266 # elif NR_CPUS >= 8
267 #  define RT_HASH_LOCK_SZ       1024
268 # elif NR_CPUS >= 4
269 #  define RT_HASH_LOCK_SZ       512
270 # else
271 #  define RT_HASH_LOCK_SZ       256
272 # endif
273 #endif
274
275 static spinlock_t       *rt_hash_locks;
276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
277
278 static __init void rt_hash_lock_init(void)
279 {
280         int i;
281
282         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
283                         GFP_KERNEL);
284         if (!rt_hash_locks)
285                 panic("IP: failed to allocate rt_hash_locks\n");
286
287         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
288                 spin_lock_init(&rt_hash_locks[i]);
289 }
290 #else
291 # define rt_hash_lock_addr(slot) NULL
292
293 static inline void rt_hash_lock_init(void)
294 {
295 }
296 #endif
297
298 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
299 static unsigned                 rt_hash_mask __read_mostly;
300 static unsigned int             rt_hash_log  __read_mostly;
301
302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
304
305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
306                                    int genid)
307 {
308         return jhash_3words((__force u32)daddr, (__force u32)saddr,
309                             idx, genid)
310                 & rt_hash_mask;
311 }
312
313 static inline int rt_genid(struct net *net)
314 {
315         return atomic_read(&net->ipv4.rt_genid);
316 }
317
318 #ifdef CONFIG_PROC_FS
319 struct rt_cache_iter_state {
320         struct seq_net_private p;
321         int bucket;
322         int genid;
323 };
324
325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
326 {
327         struct rt_cache_iter_state *st = seq->private;
328         struct rtable *r = NULL;
329
330         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
331                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
332                         continue;
333                 rcu_read_lock_bh();
334                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
335                 while (r) {
336                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
337                             r->rt_genid == st->genid)
338                                 return r;
339                         r = rcu_dereference_bh(r->dst.rt_next);
340                 }
341                 rcu_read_unlock_bh();
342         }
343         return r;
344 }
345
346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
347                                           struct rtable *r)
348 {
349         struct rt_cache_iter_state *st = seq->private;
350
351         r = rcu_dereference_bh(r->dst.rt_next);
352         while (!r) {
353                 rcu_read_unlock_bh();
354                 do {
355                         if (--st->bucket < 0)
356                                 return NULL;
357                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
358                 rcu_read_lock_bh();
359                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
360         }
361         return r;
362 }
363
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
365                                         struct rtable *r)
366 {
367         struct rt_cache_iter_state *st = seq->private;
368         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369                 if (dev_net(r->dst.dev) != seq_file_net(seq))
370                         continue;
371                 if (r->rt_genid == st->genid)
372                         break;
373         }
374         return r;
375 }
376
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
378 {
379         struct rtable *r = rt_cache_get_first(seq);
380
381         if (r)
382                 while (pos && (r = rt_cache_get_next(seq, r)))
383                         --pos;
384         return pos ? NULL : r;
385 }
386
387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388 {
389         struct rt_cache_iter_state *st = seq->private;
390         if (*pos)
391                 return rt_cache_get_idx(seq, *pos - 1);
392         st->genid = rt_genid(seq_file_net(seq));
393         return SEQ_START_TOKEN;
394 }
395
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397 {
398         struct rtable *r;
399
400         if (v == SEQ_START_TOKEN)
401                 r = rt_cache_get_first(seq);
402         else
403                 r = rt_cache_get_next(seq, v);
404         ++*pos;
405         return r;
406 }
407
408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409 {
410         if (v && v != SEQ_START_TOKEN)
411                 rcu_read_unlock_bh();
412 }
413
414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
415 {
416         if (v == SEQ_START_TOKEN)
417                 seq_printf(seq, "%-127s\n",
418                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420                            "HHUptod\tSpecDst");
421         else {
422                 struct rtable *r = v;
423                 struct neighbour *n;
424                 int len, HHUptod;
425
426                 rcu_read_lock();
427                 n = dst_get_neighbour_noref(&r->dst);
428                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429                 rcu_read_unlock();
430
431                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
433                         r->dst.dev ? r->dst.dev->name : "*",
434                         (__force u32)r->rt_dst,
435                         (__force u32)r->rt_gateway,
436                         r->rt_flags, atomic_read(&r->dst.__refcnt),
437                         r->dst.__use, 0, (__force u32)r->rt_src,
438                         dst_metric_advmss(&r->dst) + 40,
439                         dst_metric(&r->dst, RTAX_WINDOW),
440                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441                               dst_metric(&r->dst, RTAX_RTTVAR)),
442                         r->rt_key_tos,
443                         -1,
444                         HHUptod,
445                         r->rt_spec_dst, &len);
446
447                 seq_printf(seq, "%*s\n", 127 - len, "");
448         }
449         return 0;
450 }
451
452 static const struct seq_operations rt_cache_seq_ops = {
453         .start  = rt_cache_seq_start,
454         .next   = rt_cache_seq_next,
455         .stop   = rt_cache_seq_stop,
456         .show   = rt_cache_seq_show,
457 };
458
459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
460 {
461         return seq_open_net(inode, file, &rt_cache_seq_ops,
462                         sizeof(struct rt_cache_iter_state));
463 }
464
465 static const struct file_operations rt_cache_seq_fops = {
466         .owner   = THIS_MODULE,
467         .open    = rt_cache_seq_open,
468         .read    = seq_read,
469         .llseek  = seq_lseek,
470         .release = seq_release_net,
471 };
472
473
474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475 {
476         int cpu;
477
478         if (*pos == 0)
479                 return SEQ_START_TOKEN;
480
481         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
482                 if (!cpu_possible(cpu))
483                         continue;
484                 *pos = cpu+1;
485                 return &per_cpu(rt_cache_stat, cpu);
486         }
487         return NULL;
488 }
489
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491 {
492         int cpu;
493
494         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495                 if (!cpu_possible(cpu))
496                         continue;
497                 *pos = cpu+1;
498                 return &per_cpu(rt_cache_stat, cpu);
499         }
500         return NULL;
501
502 }
503
504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 {
506
507 }
508
509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510 {
511         struct rt_cache_stat *st = v;
512
513         if (v == SEQ_START_TOKEN) {
514                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
515                 return 0;
516         }
517
518         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
519                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
520                    dst_entries_get_slow(&ipv4_dst_ops),
521                    st->in_hit,
522                    st->in_slow_tot,
523                    st->in_slow_mc,
524                    st->in_no_route,
525                    st->in_brd,
526                    st->in_martian_dst,
527                    st->in_martian_src,
528
529                    st->out_hit,
530                    st->out_slow_tot,
531                    st->out_slow_mc,
532
533                    st->gc_total,
534                    st->gc_ignored,
535                    st->gc_goal_miss,
536                    st->gc_dst_overflow,
537                    st->in_hlist_search,
538                    st->out_hlist_search
539                 );
540         return 0;
541 }
542
543 static const struct seq_operations rt_cpu_seq_ops = {
544         .start  = rt_cpu_seq_start,
545         .next   = rt_cpu_seq_next,
546         .stop   = rt_cpu_seq_stop,
547         .show   = rt_cpu_seq_show,
548 };
549
550
551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552 {
553         return seq_open(file, &rt_cpu_seq_ops);
554 }
555
556 static const struct file_operations rt_cpu_seq_fops = {
557         .owner   = THIS_MODULE,
558         .open    = rt_cpu_seq_open,
559         .read    = seq_read,
560         .llseek  = seq_lseek,
561         .release = seq_release,
562 };
563
564 #ifdef CONFIG_IP_ROUTE_CLASSID
565 static int rt_acct_proc_show(struct seq_file *m, void *v)
566 {
567         struct ip_rt_acct *dst, *src;
568         unsigned int i, j;
569
570         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571         if (!dst)
572                 return -ENOMEM;
573
574         for_each_possible_cpu(i) {
575                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576                 for (j = 0; j < 256; j++) {
577                         dst[j].o_bytes   += src[j].o_bytes;
578                         dst[j].o_packets += src[j].o_packets;
579                         dst[j].i_bytes   += src[j].i_bytes;
580                         dst[j].i_packets += src[j].i_packets;
581                 }
582         }
583
584         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585         kfree(dst);
586         return 0;
587 }
588
589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
590 {
591         return single_open(file, rt_acct_proc_show, NULL);
592 }
593
594 static const struct file_operations rt_acct_proc_fops = {
595         .owner          = THIS_MODULE,
596         .open           = rt_acct_proc_open,
597         .read           = seq_read,
598         .llseek         = seq_lseek,
599         .release        = single_release,
600 };
601 #endif
602
603 static int __net_init ip_rt_do_proc_init(struct net *net)
604 {
605         struct proc_dir_entry *pde;
606
607         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608                         &rt_cache_seq_fops);
609         if (!pde)
610                 goto err1;
611
612         pde = proc_create("rt_cache", S_IRUGO,
613                           net->proc_net_stat, &rt_cpu_seq_fops);
614         if (!pde)
615                 goto err2;
616
617 #ifdef CONFIG_IP_ROUTE_CLASSID
618         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
619         if (!pde)
620                 goto err3;
621 #endif
622         return 0;
623
624 #ifdef CONFIG_IP_ROUTE_CLASSID
625 err3:
626         remove_proc_entry("rt_cache", net->proc_net_stat);
627 #endif
628 err2:
629         remove_proc_entry("rt_cache", net->proc_net);
630 err1:
631         return -ENOMEM;
632 }
633
634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
635 {
636         remove_proc_entry("rt_cache", net->proc_net_stat);
637         remove_proc_entry("rt_cache", net->proc_net);
638 #ifdef CONFIG_IP_ROUTE_CLASSID
639         remove_proc_entry("rt_acct", net->proc_net);
640 #endif
641 }
642
643 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
644         .init = ip_rt_do_proc_init,
645         .exit = ip_rt_do_proc_exit,
646 };
647
648 static int __init ip_rt_proc_init(void)
649 {
650         return register_pernet_subsys(&ip_rt_proc_ops);
651 }
652
653 #else
654 static inline int ip_rt_proc_init(void)
655 {
656         return 0;
657 }
658 #endif /* CONFIG_PROC_FS */
659
660 static inline void rt_free(struct rtable *rt)
661 {
662         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
663 }
664
665 static inline void rt_drop(struct rtable *rt)
666 {
667         ip_rt_put(rt);
668         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
669 }
670
671 static inline int rt_fast_clean(struct rtable *rth)
672 {
673         /* Kill broadcast/multicast entries very aggressively, if they
674            collide in the hash table with more useful entries */
675         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
676                 rt_is_input_route(rth) && rth->dst.rt_next;
677 }
678
679 static inline int rt_valuable(struct rtable *rth)
680 {
681         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
682                 (rth->peer && rth->peer->pmtu_expires);
683 }
684
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686 {
687         unsigned long age;
688         int ret = 0;
689
690         if (atomic_read(&rth->dst.__refcnt))
691                 goto out;
692
693         age = jiffies - rth->dst.lastuse;
694         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695             (age <= tmo2 && rt_valuable(rth)))
696                 goto out;
697         ret = 1;
698 out:    return ret;
699 }
700
701 /* Bits of score are:
702  * 31: very valuable
703  * 30: not quite useless
704  * 29..0: usage counter
705  */
706 static inline u32 rt_score(struct rtable *rt)
707 {
708         u32 score = jiffies - rt->dst.lastuse;
709
710         score = ~score & ~(3<<30);
711
712         if (rt_valuable(rt))
713                 score |= (1<<31);
714
715         if (rt_is_output_route(rt) ||
716             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717                 score |= (1<<30);
718
719         return score;
720 }
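/* Illustrative reading of the score: the low 30 bits hold the bitwise
 * complement of the entry's age, so recently used entries score higher.
 * rt_intern_hash() below picks the unreferenced entry with the minimum
 * score as its eviction candidate on an over-long chain, i.e. typically
 * the oldest entry that is neither "valuable" (bit 31) nor an output or
 * forwarded-unicast route (bit 30). */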
721
722 static inline bool rt_caching(const struct net *net)
723 {
724         return net->ipv4.current_rt_cache_rebuild_count <=
725                 net->ipv4.sysctl_rt_cache_rebuild_count;
726 }
727
728 static inline bool compare_hash_inputs(const struct rtable *rt1,
729                                        const struct rtable *rt2)
730 {
731         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
733                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
734 }
735
736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
737 {
738         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740                 (rt1->rt_mark ^ rt2->rt_mark) |
741                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
742                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
743                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
744 }
745
746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747 {
748         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
749 }
750
751 static inline int rt_is_expired(struct rtable *rth)
752 {
753         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
754 }
755
756 /*
757  * Perform a full scan of the hash table and free all entries.
758  * Can be called from a softirq or from process context.
759  * In the latter case, we want to reschedule if necessary.
760  */
761 static void rt_do_flush(struct net *net, int process_context)
762 {
763         unsigned int i;
764         struct rtable *rth, *next;
765
766         for (i = 0; i <= rt_hash_mask; i++) {
767                 struct rtable __rcu **pprev;
768                 struct rtable *list;
769
770                 if (process_context && need_resched())
771                         cond_resched();
772                 rth = rcu_access_pointer(rt_hash_table[i].chain);
773                 if (!rth)
774                         continue;
775
776                 spin_lock_bh(rt_hash_lock_addr(i));
777
778                 list = NULL;
779                 pprev = &rt_hash_table[i].chain;
780                 rth = rcu_dereference_protected(*pprev,
781                         lockdep_is_held(rt_hash_lock_addr(i)));
782
783                 while (rth) {
784                         next = rcu_dereference_protected(rth->dst.rt_next,
785                                 lockdep_is_held(rt_hash_lock_addr(i)));
786
787                         if (!net ||
788                             net_eq(dev_net(rth->dst.dev), net)) {
789                                 rcu_assign_pointer(*pprev, next);
790                                 rcu_assign_pointer(rth->dst.rt_next, list);
791                                 list = rth;
792                         } else {
793                                 pprev = &rth->dst.rt_next;
794                         }
795                         rth = next;
796                 }
797
798                 spin_unlock_bh(rt_hash_lock_addr(i));
799
800                 for (; list; list = next) {
801                         next = rcu_dereference_protected(list->dst.rt_next, 1);
802                         rt_free(list);
803                 }
804         }
805 }
806
807 /*
808  * While freeing expired entries, we compute average chain length
809  * and standard deviation, using fixed-point arithmetic.
810  * This gives an estimation of rt_chain_length_max
811  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
812  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
813  */
814
815 #define FRACT_BITS 3
816 #define ONE (1UL << FRACT_BITS)
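/* Worked example of the fixed-point averaging in rt_check_expire(): each
 * distinct entry on a chain contributes ONE (== 8) via has_noalias(), so a
 * chain of two distinct entries is measured as 16.  If the sampled buckets
 * give avg == 16 and sd == 8, then rt_chain_length_max becomes
 * (16 + 4 * 8) >> FRACT_BITS == 6, unless ip_rt_gc_elasticity is larger. */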
817
818 /*
819  * Given a hash chain and an item in this hash chain,
820  * find if a previous entry has the same hash_inputs
821  * (but differs on tos, mark or oif)
822  * Returns 0 if an alias is found.
823  * Returns ONE if rth has no alias before itself.
824  */
825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
826 {
827         const struct rtable *aux = head;
828
829         while (aux != rth) {
830                 if (compare_hash_inputs(aux, rth))
831                         return 0;
832                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
833         }
834         return ONE;
835 }
836
837 static void rt_check_expire(void)
838 {
839         static unsigned int rover;
840         unsigned int i = rover, goal;
841         struct rtable *rth;
842         struct rtable __rcu **rthp;
843         unsigned long samples = 0;
844         unsigned long sum = 0, sum2 = 0;
845         unsigned long delta;
846         u64 mult;
847
848         delta = jiffies - expires_ljiffies;
849         expires_ljiffies = jiffies;
850         mult = ((u64)delta) << rt_hash_log;
851         if (ip_rt_gc_timeout > 1)
852                 do_div(mult, ip_rt_gc_timeout);
853         goal = (unsigned int)mult;
854         if (goal > rt_hash_mask)
855                 goal = rt_hash_mask + 1;
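        /* In other words, goal is roughly
         * delta * (1 << rt_hash_log) / ip_rt_gc_timeout buckets per pass,
         * so the whole hash table is swept about once every
         * ip_rt_gc_timeout jiffies (clamped to one full sweep). */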
856         for (; goal > 0; goal--) {
857                 unsigned long tmo = ip_rt_gc_timeout;
858                 unsigned long length;
859
860                 i = (i + 1) & rt_hash_mask;
861                 rthp = &rt_hash_table[i].chain;
862
863                 if (need_resched())
864                         cond_resched();
865
866                 samples++;
867
868                 if (rcu_dereference_raw(*rthp) == NULL)
869                         continue;
870                 length = 0;
871                 spin_lock_bh(rt_hash_lock_addr(i));
872                 while ((rth = rcu_dereference_protected(*rthp,
873                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874                         prefetch(rth->dst.rt_next);
875                         if (rt_is_expired(rth)) {
876                                 *rthp = rth->dst.rt_next;
877                                 rt_free(rth);
878                                 continue;
879                         }
880                         if (rth->dst.expires) {
881                                 /* Entry is expired even if it is in use */
882                                 if (time_before_eq(jiffies, rth->dst.expires)) {
883 nofree:
884                                         tmo >>= 1;
885                                         rthp = &rth->dst.rt_next;
886                                         /*
887                                          * We only count entries on
888                                          * a chain with equal hash inputs once
889                                          * so that entries for different QOS
890                                          * levels, and other non-hash input
891                                          * attributes don't unfairly skew
892                                          * the length computation
893                                          */
894                                         length += has_noalias(rt_hash_table[i].chain, rth);
895                                         continue;
896                                 }
897                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898                                 goto nofree;
899
900                         /* Cleanup aged off entries. */
901                         *rthp = rth->dst.rt_next;
902                         rt_free(rth);
903                 }
904                 spin_unlock_bh(rt_hash_lock_addr(i));
905                 sum += length;
906                 sum2 += length*length;
907         }
908         if (samples) {
909                 unsigned long avg = sum / samples;
910                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911                 rt_chain_length_max = max_t(unsigned long,
912                                         ip_rt_gc_elasticity,
913                                         (avg + 4*sd) >> FRACT_BITS);
914         }
915         rover = i;
916 }
917
918 /*
919  * rt_worker_func() is run in process context.
920  * we call rt_check_expire() to scan part of the hash table
921  */
922 static void rt_worker_func(struct work_struct *work)
923 {
924         rt_check_expire();
925         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926 }
927
928 /*
929  * Perturbation of rt_genid by a small quantity [1..256]
930  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
931  * many times (2^24) without handing out a recently used rt_genid.
932  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
933  */
934 static void rt_cache_invalidate(struct net *net)
935 {
936         unsigned char shuffle;
937
938         get_random_bytes(&shuffle, sizeof(shuffle));
939         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
940         redirect_genid++;
941 }
942
943 /*
944  * delay < 0  : invalidate cache (fast : entries will be deleted later)
945  * delay >= 0 : invalidate & flush cache (can be long)
946  */
947 void rt_cache_flush(struct net *net, int delay)
948 {
949         rt_cache_invalidate(net);
950         if (delay >= 0)
951                 rt_do_flush(net, !in_softirq());
952 }
953
954 /* Flush previously invalidated entries from the cache */
955 void rt_cache_flush_batch(struct net *net)
956 {
957         rt_do_flush(net, !in_softirq());
958 }
959
960 static void rt_emergency_hash_rebuild(struct net *net)
961 {
962         if (net_ratelimit())
963                 printk(KERN_WARNING "Route hash chain too long!\n");
964         rt_cache_invalidate(net);
965 }
966
967 /*
968    Short description of GC goals.
969
970    We want to build an algorithm which keeps the routing cache
971    at some equilibrium point, where the number of aged-off entries
972    is approximately equal to the number of newly generated ones.
973
974    The current expiration strength is the variable "expire".
975    We try to adjust it dynamically, so that when the network
976    is idle, expire is large enough to keep plenty of warm entries,
977    and when load increases it is reduced to limit the cache size.
978  */
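/* A rough sizing illustration for the collector below: the cache is only
 * trimmed aggressively once it holds more than
 * ip_rt_gc_elasticity << rt_hash_log entries, e.g. 8 * 65536 = 524288 routes
 * for a hypothetical 2^16-bucket table with the default elasticity of 8.
 * Below that, the goal is computed against the softer equilibrium target. */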
979
980 static int rt_garbage_collect(struct dst_ops *ops)
981 {
982         static unsigned long expire = RT_GC_TIMEOUT;
983         static unsigned long last_gc;
984         static int rover;
985         static int equilibrium;
986         struct rtable *rth;
987         struct rtable __rcu **rthp;
988         unsigned long now = jiffies;
989         int goal;
990         int entries = dst_entries_get_fast(&ipv4_dst_ops);
991
992         /*
993          * Garbage collection is pretty expensive,
994          * do not run it too frequently.
995          */
996
997         RT_CACHE_STAT_INC(gc_total);
998
999         if (now - last_gc < ip_rt_gc_min_interval &&
1000             entries < ip_rt_max_size) {
1001                 RT_CACHE_STAT_INC(gc_ignored);
1002                 goto out;
1003         }
1004
1005         entries = dst_entries_get_slow(&ipv4_dst_ops);
1006         /* Calculate the number of entries we want to expire now. */
1007         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008         if (goal <= 0) {
1009                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010                         equilibrium = ipv4_dst_ops.gc_thresh;
1011                 goal = entries - equilibrium;
1012                 if (goal > 0) {
1013                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014                         goal = entries - equilibrium;
1015                 }
1016         } else {
1017                 /* We are in a dangerous area. Try to reduce the cache really
1018                  * aggressively.
1019                  */
1020                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021                 equilibrium = entries - goal;
1022         }
1023
1024         if (now - last_gc >= ip_rt_gc_min_interval)
1025                 last_gc = now;
1026
1027         if (goal <= 0) {
1028                 equilibrium += goal;
1029                 goto work_done;
1030         }
1031
1032         do {
1033                 int i, k;
1034
1035                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036                         unsigned long tmo = expire;
1037
1038                         k = (k + 1) & rt_hash_mask;
1039                         rthp = &rt_hash_table[k].chain;
1040                         spin_lock_bh(rt_hash_lock_addr(k));
1041                         while ((rth = rcu_dereference_protected(*rthp,
1042                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
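                                /* Note: tmo starts at the current "expire" value
                                 * and is halved for every entry we keep, so the
                                 * deeper we walk into a chain the easier it is for
                                 * rt_may_expire() to evict, pruning long chains
                                 * faster. */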
1043                                 if (!rt_is_expired(rth) &&
1044                                         !rt_may_expire(rth, tmo, expire)) {
1045                                         tmo >>= 1;
1046                                         rthp = &rth->dst.rt_next;
1047                                         continue;
1048                                 }
1049                                 *rthp = rth->dst.rt_next;
1050                                 rt_free(rth);
1051                                 goal--;
1052                         }
1053                         spin_unlock_bh(rt_hash_lock_addr(k));
1054                         if (goal <= 0)
1055                                 break;
1056                 }
1057                 rover = k;
1058
1059                 if (goal <= 0)
1060                         goto work_done;
1061
1062                 /* Goal is not achieved. We stop the process if:
1063
1064                    - expire has been reduced to zero (otherwise, expire is halved).
1065                    - the table is not full.
1066                    - we are called from interrupt context.
1067                    - the jiffies check is just a fallback/debug loop breaker;
1068                      we will not spin here for a long time in any case.
1069                  */
1070
1071                 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073                 if (expire == 0)
1074                         break;
1075
1076                 expire >>= 1;
1077
1078                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079                         goto out;
1080         } while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085                 goto out;
1086         if (net_ratelimit())
1087                 printk(KERN_WARNING "dst cache overflow\n");
1088         RT_CACHE_STAT_INC(gc_dst_overflow);
1089         return 1;
1090
1091 work_done:
1092         expire += ip_rt_gc_min_interval;
1093         if (expire > ip_rt_gc_timeout ||
1094             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1096                 expire = ip_rt_gc_timeout;
1097 out:    return 0;
1098 }
1099
1100 /*
1101  * Returns number of entries in a hash chain that have different hash_inputs
1102  */
1103 static int slow_chain_length(const struct rtable *head)
1104 {
1105         int length = 0;
1106         const struct rtable *rth = head;
1107
1108         while (rth) {
1109                 length += has_noalias(head, rth);
1110                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1111         }
1112         return length >> FRACT_BITS;
1113 }
1114
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1116 {
1117         static const __be32 inaddr_any = 0;
1118         struct net_device *dev = dst->dev;
1119         const __be32 *pkey = daddr;
1120         struct neighbour *n;
1121
1122         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1123                 pkey = &inaddr_any;
1124
1125         n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1126         if (n)
1127                 return n;
1128         return neigh_create(&arp_tbl, pkey, dev);
1129 }
1130
1131 static int rt_bind_neighbour(struct rtable *rt)
1132 {
1133         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1134         if (IS_ERR(n))
1135                 return PTR_ERR(n);
1136         dst_set_neighbour(&rt->dst, n);
1137
1138         return 0;
1139 }
1140
1141 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1142                                      struct sk_buff *skb, int ifindex)
1143 {
1144         struct rtable   *rth, *cand;
1145         struct rtable __rcu **rthp, **candp;
1146         unsigned long   now;
1147         u32             min_score;
1148         int             chain_length;
1149         int attempts = !in_softirq();
1150
1151 restart:
1152         chain_length = 0;
1153         min_score = ~(u32)0;
1154         cand = NULL;
1155         candp = NULL;
1156         now = jiffies;
1157
1158         if (!rt_caching(dev_net(rt->dst.dev))) {
1159                 /*
1160                  * If we're not caching, just tell the caller we
1161                  * were successful and don't touch the route.  The
1162                  * caller holds the sole reference to the cache entry, and
1163                  * it will be released when the caller is done with it.
1164                  * If we drop it here, the callers have no way to resolve routes
1165                  * when we're not caching.  Instead, just point *rp at rt, so
1166                  * the caller gets a single use out of the route
1167                  * Note that we do rt_free on this new route entry, so that
1168                  * once its refcount hits zero, we are still able to reap it
1169                  * (Thanks Alexey)
1170                  * Note: To avoid expensive rcu stuff for this uncached dst,
1171                  * we set DST_NOCACHE so that dst_release() can free dst without
1172                  * waiting for a grace period.
1173                  */
1174
1175                 rt->dst.flags |= DST_NOCACHE;
1176                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1177                         int err = rt_bind_neighbour(rt);
1178                         if (err) {
1179                                 if (net_ratelimit())
1180                                         printk(KERN_WARNING
1181                                             "Neighbour table failure & not caching routes.\n");
1182                                 ip_rt_put(rt);
1183                                 return ERR_PTR(err);
1184                         }
1185                 }
1186
1187                 goto skip_hashing;
1188         }
1189
1190         rthp = &rt_hash_table[hash].chain;
1191
1192         spin_lock_bh(rt_hash_lock_addr(hash));
1193         while ((rth = rcu_dereference_protected(*rthp,
1194                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1195                 if (rt_is_expired(rth)) {
1196                         *rthp = rth->dst.rt_next;
1197                         rt_free(rth);
1198                         continue;
1199                 }
1200                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1201                         /* Put it first */
1202                         *rthp = rth->dst.rt_next;
1203                         /*
1204                          * Since lookup is lockfree, the deletion
1205                          * must be visible to another weakly ordered CPU before
1206                          * the insertion at the start of the hash chain.
1207                          */
1208                         rcu_assign_pointer(rth->dst.rt_next,
1209                                            rt_hash_table[hash].chain);
1210                         /*
1211                          * Since lookup is lockfree, the update writes
1212                          * must be ordered for consistency on SMP.
1213                          */
1214                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1215
1216                         dst_use(&rth->dst, now);
1217                         spin_unlock_bh(rt_hash_lock_addr(hash));
1218
1219                         rt_drop(rt);
1220                         if (skb)
1221                                 skb_dst_set(skb, &rth->dst);
1222                         return rth;
1223                 }
1224
1225                 if (!atomic_read(&rth->dst.__refcnt)) {
1226                         u32 score = rt_score(rth);
1227
1228                         if (score <= min_score) {
1229                                 cand = rth;
1230                                 candp = rthp;
1231                                 min_score = score;
1232                         }
1233                 }
1234
1235                 chain_length++;
1236
1237                 rthp = &rth->dst.rt_next;
1238         }
1239
1240         if (cand) {
1241                 /* ip_rt_gc_elasticity used to be the average chain length;
1242                  * when it is exceeded, gc becomes really aggressive.
1243                  *
1244                  * The second limit is less certain. At the moment it allows
1245                  * only 2 entries per bucket. We will see.
1246                  */
1247                 if (chain_length > ip_rt_gc_elasticity) {
1248                         *candp = cand->dst.rt_next;
1249                         rt_free(cand);
1250                 }
1251         } else {
1252                 if (chain_length > rt_chain_length_max &&
1253                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1254                         struct net *net = dev_net(rt->dst.dev);
1255                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1256                         if (!rt_caching(net)) {
1257                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1258                                         rt->dst.dev->name, num);
1259                         }
1260                         rt_emergency_hash_rebuild(net);
1261                         spin_unlock_bh(rt_hash_lock_addr(hash));
1262
1263                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1264                                         ifindex, rt_genid(net));
1265                         goto restart;
1266                 }
1267         }
1268
1269         /* Try to bind the route to ARP only if it is an output
1270            route or a unicast forwarding path.
1271          */
1272         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1273                 int err = rt_bind_neighbour(rt);
1274                 if (err) {
1275                         spin_unlock_bh(rt_hash_lock_addr(hash));
1276
1277                         if (err != -ENOBUFS) {
1278                                 rt_drop(rt);
1279                                 return ERR_PTR(err);
1280                         }
1281
1282                         /* Neighbour tables are full and nothing
1283                            can be released. Try to shrink the route cache;
1284                            it most likely holds some neighbour records.
1285                          */
1286                         if (attempts-- > 0) {
1287                                 int saved_elasticity = ip_rt_gc_elasticity;
1288                                 int saved_int = ip_rt_gc_min_interval;
1289                                 ip_rt_gc_elasticity     = 1;
1290                                 ip_rt_gc_min_interval   = 0;
1291                                 rt_garbage_collect(&ipv4_dst_ops);
1292                                 ip_rt_gc_min_interval   = saved_int;
1293                                 ip_rt_gc_elasticity     = saved_elasticity;
1294                                 goto restart;
1295                         }
1296
1297                         if (net_ratelimit())
1298                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1299                         rt_drop(rt);
1300                         return ERR_PTR(-ENOBUFS);
1301                 }
1302         }
1303
1304         rt->dst.rt_next = rt_hash_table[hash].chain;
1305
1306         /*
1307          * Since lookup is lockfree, we must make sure
1308          * previous writes to rt are committed to memory
1309          * before making rt visible to other CPUs.
1310          */
1311         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1312
1313         spin_unlock_bh(rt_hash_lock_addr(hash));
1314
1315 skip_hashing:
1316         if (skb)
1317                 skb_dst_set(skb, &rt->dst);
1318         return rt;
1319 }
1320
1321 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1322
1323 static u32 rt_peer_genid(void)
1324 {
1325         return atomic_read(&__rt_peer_genid);
1326 }
1327
1328 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1329 {
1330         struct inet_peer *peer;
1331
1332         peer = inet_getpeer_v4(daddr, create);
1333
1334         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1335                 inet_putpeer(peer);
1336         else
1337                 rt->rt_peer_genid = rt_peer_genid();
1338 }
1339
1340 /*
1341  * Peer allocation may fail only in serious out-of-memory conditions.  However
1342  * we can still generate some output.
1343  * Random ID selection looks a bit dangerous because we have no chance of
1344  * selecting an ID that is unique within a reasonable period of time.
1345  * But a broken packet identifier may be better than no packet at all.
1346  */
1347 static void ip_select_fb_ident(struct iphdr *iph)
1348 {
1349         static DEFINE_SPINLOCK(ip_fb_id_lock);
1350         static u32 ip_fallback_id;
1351         u32 salt;
1352
1353         spin_lock_bh(&ip_fb_id_lock);
1354         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1355         iph->id = htons(salt & 0xFFFF);
1356         ip_fallback_id = salt;
1357         spin_unlock_bh(&ip_fb_id_lock);
1358 }
1359
1360 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1361 {
1362         struct rtable *rt = (struct rtable *) dst;
1363
1364         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1365                 if (rt->peer == NULL)
1366                         rt_bind_peer(rt, rt->rt_dst, 1);
1367
1368                 /* If peer is attached to destination, it is never detached,
1369                    so we need not grab a lock to dereference it.
1370                  */
1371                 if (rt->peer) {
1372                         iph->id = htons(inet_getid(rt->peer, more));
1373                         return;
1374                 }
1375         } else if (!rt)
1376                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1377                        __builtin_return_address(0));
1378
1379         ip_select_fb_ident(iph);
1380 }
1381 EXPORT_SYMBOL(__ip_select_ident);
1382
1383 static void rt_del(unsigned hash, struct rtable *rt)
1384 {
1385         struct rtable __rcu **rthp;
1386         struct rtable *aux;
1387
1388         rthp = &rt_hash_table[hash].chain;
1389         spin_lock_bh(rt_hash_lock_addr(hash));
1390         ip_rt_put(rt);
1391         while ((aux = rcu_dereference_protected(*rthp,
1392                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393                 if (aux == rt || rt_is_expired(aux)) {
1394                         *rthp = aux->dst.rt_next;
1395                         rt_free(aux);
1396                         continue;
1397                 }
1398                 rthp = &aux->dst.rt_next;
1399         }
1400         spin_unlock_bh(rt_hash_lock_addr(hash));
1401 }
1402
1403 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1404 {
1405         struct rtable *rt = (struct rtable *) dst;
1406         __be32 orig_gw = rt->rt_gateway;
1407         struct neighbour *n, *old_n;
1408
1409         dst_confirm(&rt->dst);
1410
1411         rt->rt_gateway = peer->redirect_learned.a4;
1412
1413         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1414         if (IS_ERR(n)) {
1415                 rt->rt_gateway = orig_gw;
1416                 return;
1417         }
1418         old_n = xchg(&rt->dst._neighbour, n);
1419         if (old_n)
1420                 neigh_release(old_n);
1421         if (!(n->nud_state & NUD_VALID)) {
1422                 neigh_event_send(n, NULL);
1423         } else {
1424                 rt->rt_flags |= RTCF_REDIRECTED;
1425                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426         }
1427 }
1428
1429 /* called in rcu_read_lock() section */
1430 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431                     __be32 saddr, struct net_device *dev)
1432 {
1433         int s, i;
1434         struct in_device *in_dev = __in_dev_get_rcu(dev);
1435         __be32 skeys[2] = { saddr, 0 };
1436         int    ikeys[2] = { dev->ifindex, 0 };
1437         struct inet_peer *peer;
1438         struct net *net;
1439
1440         if (!in_dev)
1441                 return;
1442
1443         net = dev_net(dev);
1444         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1445             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1446             ipv4_is_zeronet(new_gw))
1447                 goto reject_redirect;
1448
1449         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1450                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1451                         goto reject_redirect;
1452                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1453                         goto reject_redirect;
1454         } else {
1455                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1456                         goto reject_redirect;
1457         }
1458
1459         for (s = 0; s < 2; s++) {
1460                 for (i = 0; i < 2; i++) {
1461                         unsigned int hash;
1462                         struct rtable __rcu **rthp;
1463                         struct rtable *rt;
1464
1465                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1466
1467                         rthp = &rt_hash_table[hash].chain;
1468
1469                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1470                                 rthp = &rt->dst.rt_next;
1471
1472                                 if (rt->rt_key_dst != daddr ||
1473                                     rt->rt_key_src != skeys[s] ||
1474                                     rt->rt_oif != ikeys[i] ||
1475                                     rt_is_input_route(rt) ||
1476                                     rt_is_expired(rt) ||
1477                                     !net_eq(dev_net(rt->dst.dev), net) ||
1478                                     rt->dst.error ||
1479                                     rt->dst.dev != dev ||
1480                                     rt->rt_gateway != old_gw)
1481                                         continue;
1482
1483                                 if (!rt->peer)
1484                                         rt_bind_peer(rt, rt->rt_dst, 1);
1485
1486                                 peer = rt->peer;
1487                                 if (peer) {
1488                                         if (peer->redirect_learned.a4 != new_gw ||
1489                                             peer->redirect_genid != redirect_genid) {
1490                                                 peer->redirect_learned.a4 = new_gw;
1491                                                 peer->redirect_genid = redirect_genid;
1492                                                 atomic_inc(&__rt_peer_genid);
1493                                         }
1494                                         check_peer_redir(&rt->dst, peer);
1495                                 }
1496                         }
1497                 }
1498         }
1499         return;
1500
1501 reject_redirect:
1502 #ifdef CONFIG_IP_ROUTE_VERBOSE
1503         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1504                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1505                         "  Advised path = %pI4 -> %pI4\n",
1506                        &old_gw, dev->name, &new_gw,
1507                        &saddr, &daddr);
1508 #endif
1509         ;
1510 }
1511
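/*
 * peer_pmtu_expired()/peer_pmtu_cleaned() - claim a pending PMTU reset.
 * Both clear peer->pmtu_expires with cmpxchg() so that only one caller
 * wins and goes on to restore the original MTU; _expired() additionally
 * requires that the expiry time has actually passed.
 */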
1512 static bool peer_pmtu_expired(struct inet_peer *peer)
1513 {
1514         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1515
1516         return orig &&
1517                time_after_eq(jiffies, orig) &&
1518                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1519 }
1520
1521 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1522 {
1523         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1524
1525         return orig &&
1526                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1527 }
1528
1529 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1530 {
1531         struct rtable *rt = (struct rtable *)dst;
1532         struct dst_entry *ret = dst;
1533
1534         if (rt) {
1535                 if (dst->obsolete > 0) {
1536                         ip_rt_put(rt);
1537                         ret = NULL;
1538                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1539                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1540                                                 rt->rt_oif,
1541                                                 rt_genid(dev_net(dst->dev)));
1542                         rt_del(hash, rt);
1543                         ret = NULL;
1544                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1545                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1546                 }
1547         }
1548         return ret;
1549 }
1550
1551 /*
1552  * Algorithm:
1553  *      1. The first ip_rt_redirect_number redirects are sent
1554  *         with exponential backoff, then we stop sending them altogether,
1555  *         assuming that the host ignores our redirects.
1556  *      2. If we did not see packets requiring redirects
1557  *         during ip_rt_redirect_silence, we assume that the host
1558  *         forgot the redirected route and start sending redirects again.
1559  *
1560  * This algorithm is much cheaper and more intelligent than dumb load limiting
1561  * in icmp.c.
1562  *
1563  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1564  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1565  */
1566
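/*
 * ip_rt_send_redirect() implements the scheme above with a counter in
 * the destination's inet_peer: rate_tokens counts redirects already
 * sent (and is reset after ip_rt_redirect_silence of quiet time), and a
 * new redirect is only sent once ip_rt_redirect_load << rate_tokens has
 * elapsed since the previous one, which gives the exponential backoff.
 * When no inet_peer is available the redirect is sent unthrottled.
 */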
1567 void ip_rt_send_redirect(struct sk_buff *skb)
1568 {
1569         struct rtable *rt = skb_rtable(skb);
1570         struct in_device *in_dev;
1571         struct inet_peer *peer;
1572         int log_martians;
1573
1574         rcu_read_lock();
1575         in_dev = __in_dev_get_rcu(rt->dst.dev);
1576         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1577                 rcu_read_unlock();
1578                 return;
1579         }
1580         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1581         rcu_read_unlock();
1582
1583         if (!rt->peer)
1584                 rt_bind_peer(rt, rt->rt_dst, 1);
1585         peer = rt->peer;
1586         if (!peer) {
1587                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1588                 return;
1589         }
1590
1591         /* No redirected packets during ip_rt_redirect_silence;
1592          * reset the algorithm.
1593          */
1594         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1595                 peer->rate_tokens = 0;
1596
1597         /* Too many ignored redirects; do not send anything.
1598          * Set dst.rate_last to the last seen redirected packet.
1599          */
1600         if (peer->rate_tokens >= ip_rt_redirect_number) {
1601                 peer->rate_last = jiffies;
1602                 return;
1603         }
1604
1605         /* Check for load limit; set rate_last to the latest sent
1606          * redirect.
1607          */
1608         if (peer->rate_tokens == 0 ||
1609             time_after(jiffies,
1610                        (peer->rate_last +
1611                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1612                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1613                 peer->rate_last = jiffies;
1614                 ++peer->rate_tokens;
1615 #ifdef CONFIG_IP_ROUTE_VERBOSE
1616                 if (log_martians &&
1617                     peer->rate_tokens == ip_rt_redirect_number &&
1618                     net_ratelimit())
1619                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1620                                &ip_hdr(skb)->saddr, rt->rt_iif,
1621                                 &rt->rt_dst, &rt->rt_gateway);
1622 #endif
1623         }
1624 }
1625
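/*
 * ip_error() - dst input handler installed for unroutable destinations.
 * Maps dst.error onto an ICMP destination-unreachable code and sends it
 * subject to a token bucket kept in the inet_peer (refilled one token
 * per jiffy up to ip_rt_error_burst, each ICMP costing
 * ip_rt_error_cost), then frees the skb.
 */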
1626 static int ip_error(struct sk_buff *skb)
1627 {
1628         struct rtable *rt = skb_rtable(skb);
1629         struct inet_peer *peer;
1630         unsigned long now;
1631         bool send;
1632         int code;
1633
1634         switch (rt->dst.error) {
1635         case EINVAL:
1636         default:
1637                 goto out;
1638         case EHOSTUNREACH:
1639                 code = ICMP_HOST_UNREACH;
1640                 break;
1641         case ENETUNREACH:
1642                 code = ICMP_NET_UNREACH;
1643                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1644                                 IPSTATS_MIB_INNOROUTES);
1645                 break;
1646         case EACCES:
1647                 code = ICMP_PKT_FILTERED;
1648                 break;
1649         }
1650
1651         if (!rt->peer)
1652                 rt_bind_peer(rt, rt->rt_dst, 1);
1653         peer = rt->peer;
1654
1655         send = true;
1656         if (peer) {
1657                 now = jiffies;
1658                 peer->rate_tokens += now - peer->rate_last;
1659                 if (peer->rate_tokens > ip_rt_error_burst)
1660                         peer->rate_tokens = ip_rt_error_burst;
1661                 peer->rate_last = now;
1662                 if (peer->rate_tokens >= ip_rt_error_cost)
1663                         peer->rate_tokens -= ip_rt_error_cost;
1664                 else
1665                         send = false;
1666         }
1667         if (send)
1668                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1669
1670 out:    kfree_skb(skb);
1671         return 0;
1672 }
1673
1674 /*
1675  *      The last two values are not from the RFC but
1676  *      are needed for AMPRnet AX.25 paths.
1677  */
1678
1679 static const unsigned short mtu_plateau[] =
1680 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1681
1682 static inline unsigned short guess_mtu(unsigned short old_mtu)
1683 {
1684         int i;
1685
1686         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1687                 if (old_mtu > mtu_plateau[i])
1688                         return mtu_plateau[i];
1689         return 68;
1690 }
1691
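/*
 * ip_rt_frag_needed() - learn a path MTU from an ICMP "frag. needed".
 * If the advertised next-hop MTU is missing or implausible, fall back
 * to the plateau table above: e.g. when the ICMP quotes a 1500-byte
 * datagram but advertises an MTU of 1500 or more, guess_mtu(1500)
 * yields 1492.  The result is clamped to ip_rt_min_pmtu and, if it
 * improves on what the peer already holds, recorded in the inet_peer
 * with an expiry of ip_rt_mtu_expires from now.
 */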
1692 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1693                                  unsigned short new_mtu,
1694                                  struct net_device *dev)
1695 {
1696         unsigned short old_mtu = ntohs(iph->tot_len);
1697         unsigned short est_mtu = 0;
1698         struct inet_peer *peer;
1699
1700         peer = inet_getpeer_v4(iph->daddr, 1);
1701         if (peer) {
1702                 unsigned short mtu = new_mtu;
1703
1704                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1705                         /* BSD 4.2 derived systems incorrectly adjust
1706                          * tot_len by the IP header length, and report
1707                          * a zero MTU in the ICMP message.
1708                          */
1709                         if (mtu == 0 &&
1710                             old_mtu >= 68 + (iph->ihl << 2))
1711                                 old_mtu -= iph->ihl << 2;
1712                         mtu = guess_mtu(old_mtu);
1713                 }
1714
1715                 if (mtu < ip_rt_min_pmtu)
1716                         mtu = ip_rt_min_pmtu;
1717                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1718                         unsigned long pmtu_expires;
1719
1720                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1721                         if (!pmtu_expires)
1722                                 pmtu_expires = 1UL;
1723
1724                         est_mtu = mtu;
1725                         peer->pmtu_learned = mtu;
1726                         peer->pmtu_expires = pmtu_expires;
1727                         atomic_inc(&__rt_peer_genid);
1728                 }
1729
1730                 inet_putpeer(peer);
1731         }
1732         return est_mtu ? : new_mtu;
1733 }
1734
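/*
 * check_peer_pmtu() - sync the dst MTU metric with the peer's PMTU.
 * While the learned value is unexpired and smaller than the current dst
 * MTU, it is installed in RTAX_MTU (the original is saved in
 * pmtu_orig); once it expires, cmpxchg() ensures exactly one caller
 * restores the original MTU.
 */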
1735 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1736 {
1737         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1738
1739         if (!expires)
1740                 return;
1741         if (time_before(jiffies, expires)) {
1742                 u32 orig_dst_mtu = dst_mtu(dst);
1743                 if (peer->pmtu_learned < orig_dst_mtu) {
1744                         if (!peer->pmtu_orig)
1745                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1746                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1747                 }
1748         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1749                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1750 }
1751
1752 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1753 {
1754         struct rtable *rt = (struct rtable *) dst;
1755         struct inet_peer *peer;
1756
1757         dst_confirm(dst);
1758
1759         if (!rt->peer)
1760                 rt_bind_peer(rt, rt->rt_dst, 1);
1761         peer = rt->peer;
1762         if (peer) {
1763                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1764
1765                 if (mtu < ip_rt_min_pmtu)
1766                         mtu = ip_rt_min_pmtu;
1767                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1768
1769                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1770                         if (!pmtu_expires)
1771                                 pmtu_expires = 1UL;
1772
1773                         peer->pmtu_learned = mtu;
1774                         peer->pmtu_expires = pmtu_expires;
1775
1776                         atomic_inc(&__rt_peer_genid);
1777                         rt->rt_peer_genid = rt_peer_genid();
1778                 }
1779                 check_peer_pmtu(dst, peer);
1780         }
1781 }
1782
1783
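/*
 * ipv4_validate_peer() - refresh a cached route from its inet_peer.
 * Runs whenever the route's peer generation id lags the global counter:
 * it re-applies learned PMTU state and, if the peer carries a redirect
 * from the current generation pointing at a different gateway, reroutes
 * via check_peer_redir().
 */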
1784 static void ipv4_validate_peer(struct rtable *rt)
1785 {
1786         if (rt->rt_peer_genid != rt_peer_genid()) {
1787                 struct inet_peer *peer;
1788
1789                 if (!rt->peer)
1790                         rt_bind_peer(rt, rt->rt_dst, 0);
1791
1792                 peer = rt->peer;
1793                 if (peer) {
1794                         check_peer_pmtu(&rt->dst, peer);
1795
1796                         if (peer->redirect_genid != redirect_genid)
1797                                 peer->redirect_learned.a4 = 0;
1798                         if (peer->redirect_learned.a4 &&
1799                             peer->redirect_learned.a4 != rt->rt_gateway)
1800                                 check_peer_redir(&rt->dst, peer);
1801                 }
1802
1803                 rt->rt_peer_genid = rt_peer_genid();
1804         }
1805 }
1806
1807 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808 {
1809         struct rtable *rt = (struct rtable *) dst;
1810
1811         if (rt_is_expired(rt))
1812                 return NULL;
1813         ipv4_validate_peer(rt);
1814         return dst;
1815 }
1816
1817 static void ipv4_dst_destroy(struct dst_entry *dst)
1818 {
1819         struct rtable *rt = (struct rtable *) dst;
1820         struct inet_peer *peer = rt->peer;
1821
1822         if (rt->fi) {
1823                 fib_info_put(rt->fi);
1824                 rt->fi = NULL;
1825         }
1826         if (peer) {
1827                 rt->peer = NULL;
1828                 inet_putpeer(peer);
1829         }
1830 }
1831
1832
1833 static void ipv4_link_failure(struct sk_buff *skb)
1834 {
1835         struct rtable *rt;
1836
1837         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838
1839         rt = skb_rtable(skb);
1840         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1842 }
1843
1844 static int ip_rt_bug(struct sk_buff *skb)
1845 {
1846         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1848                 skb->dev ? skb->dev->name : "?");
1849         kfree_skb(skb);
1850         WARN_ON(1);
1851         return 0;
1852 }
1853
1854 /*
1855    We do not cache the source address of the outgoing interface,
1856    because it is used only by the IP RR, TS and SRR options,
1857    so it is out of the fast path.
1858
1859    BTW remember: "addr" is allowed to be unaligned
1860    in IP options!
1861  */
1862
1863 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1864 {
1865         __be32 src;
1866
1867         if (rt_is_output_route(rt))
1868                 src = ip_hdr(skb)->saddr;
1869         else {
1870                 struct fib_result res;
1871                 struct flowi4 fl4;
1872                 struct iphdr *iph;
1873
1874                 iph = ip_hdr(skb);
1875
1876                 memset(&fl4, 0, sizeof(fl4));
1877                 fl4.daddr = iph->daddr;
1878                 fl4.saddr = iph->saddr;
1879                 fl4.flowi4_tos = RT_TOS(iph->tos);
1880                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1881                 fl4.flowi4_iif = skb->dev->ifindex;
1882                 fl4.flowi4_mark = skb->mark;
1883
1884                 rcu_read_lock();
1885                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1886                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1887                 else
1888                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1889                                         RT_SCOPE_UNIVERSE);
1890                 rcu_read_unlock();
1891         }
1892         memcpy(addr, &src, 4);
1893 }
1894
1895 #ifdef CONFIG_IP_ROUTE_CLASSID
1896 static void set_class_tag(struct rtable *rt, u32 tag)
1897 {
1898         if (!(rt->dst.tclassid & 0xFFFF))
1899                 rt->dst.tclassid |= tag & 0xFFFF;
1900         if (!(rt->dst.tclassid & 0xFFFF0000))
1901                 rt->dst.tclassid |= tag & 0xFFFF0000;
1902 }
1903 #endif
1904
1905 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906 {
1907         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908
1909         if (advmss == 0) {
1910                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911                                ip_rt_min_advmss);
1912                 if (advmss > 65535 - 40)
1913                         advmss = 65535 - 40;
1914         }
1915         return advmss;
1916 }
1917
1918 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1919 {
1920         const struct rtable *rt = (const struct rtable *) dst;
1921         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922
1923         if (mtu && rt_is_output_route(rt))
1924                 return mtu;
1925
1926         mtu = dst->dev->mtu;
1927
1928         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1929
1930                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931                         mtu = 576;
1932         }
1933
1934         if (mtu > IP_MAX_MTU)
1935                 mtu = IP_MAX_MTU;
1936
1937         return mtu;
1938 }
1939
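/*
 * rt_init_metrics() - attach metrics to a freshly built route.
 * Prefers the destination's inet_peer (created on demand for flows
 * flagged FLOWI_FLAG_PRECOW_METRICS), seeding it from the fib_info
 * metrics on first use and picking up any learned PMTU or redirect
 * state; without a peer, the dst points straight at the fib_info
 * metrics, holding a reference unless they are the shared defaults.
 */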
1940 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1941                             struct fib_info *fi)
1942 {
1943         struct inet_peer *peer;
1944         int create = 0;
1945
1946         /* If a peer entry exists for this destination, we must hook
1947          * it up in order to get at cached metrics.
1948          */
1949         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1950                 create = 1;
1951
1952         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1953         if (peer) {
1954                 rt->rt_peer_genid = rt_peer_genid();
1955                 if (inet_metrics_new(peer))
1956                         memcpy(peer->metrics, fi->fib_metrics,
1957                                sizeof(u32) * RTAX_MAX);
1958                 dst_init_metrics(&rt->dst, peer->metrics, false);
1959
1960                 check_peer_pmtu(&rt->dst, peer);
1961                 if (peer->redirect_genid != redirect_genid)
1962                         peer->redirect_learned.a4 = 0;
1963                 if (peer->redirect_learned.a4 &&
1964                     peer->redirect_learned.a4 != rt->rt_gateway) {
1965                         rt->rt_gateway = peer->redirect_learned.a4;
1966                         rt->rt_flags |= RTCF_REDIRECTED;
1967                 }
1968         } else {
1969                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1970                         rt->fi = fi;
1971                         atomic_inc(&fi->fib_clntref);
1972                 }
1973                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1974         }
1975 }
1976
1977 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1978                            const struct fib_result *res,
1979                            struct fib_info *fi, u16 type, u32 itag)
1980 {
1981         struct dst_entry *dst = &rt->dst;
1982
1983         if (fi) {
1984                 if (FIB_RES_GW(*res) &&
1985                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1986                         rt->rt_gateway = FIB_RES_GW(*res);
1987                 rt_init_metrics(rt, fl4, fi);
1988 #ifdef CONFIG_IP_ROUTE_CLASSID
1989                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1990 #endif
1991         }
1992
1993         if (dst_mtu(dst) > IP_MAX_MTU)
1994                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1995         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1996                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1997
1998 #ifdef CONFIG_IP_ROUTE_CLASSID
1999 #ifdef CONFIG_IP_MULTIPLE_TABLES
2000         set_class_tag(rt, fib_rules_tclass(res));
2001 #endif
2002         set_class_tag(rt, itag);
2003 #endif
2004 }
2005
2006 static struct rtable *rt_dst_alloc(struct net_device *dev,
2007                                    bool nopolicy, bool noxfrm)
2008 {
2009         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2010                          DST_HOST |
2011                          (nopolicy ? DST_NOPOLICY : 0) |
2012                          (noxfrm ? DST_NOXFRM : 0));
2013 }
2014
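/*
 * ip_route_input_mc() - build a cache entry for received multicast.
 * After validating the source address, allocates an input route whose
 * input handler is ip_local_deliver (when the group is joined locally)
 * and/or ip_mr_input (when multicast forwarding is enabled), then
 * interns it in the route cache.
 */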
2015 /* called in rcu_read_lock() section */
2016 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2017                                 u8 tos, struct net_device *dev, int our)
2018 {
2019         unsigned int hash;
2020         struct rtable *rth;
2021         __be32 spec_dst;
2022         struct in_device *in_dev = __in_dev_get_rcu(dev);
2023         u32 itag = 0;
2024         int err;
2025
2026         /* Primary sanity checks. */
2027
2028         if (in_dev == NULL)
2029                 return -EINVAL;
2030
2031         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2032             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2033                 goto e_inval;
2034
2035         if (ipv4_is_zeronet(saddr)) {
2036                 if (!ipv4_is_local_multicast(daddr))
2037                         goto e_inval;
2038                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2039         } else {
2040                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2041                                           &itag);
2042                 if (err < 0)
2043                         goto e_err;
2044         }
2045         rth = rt_dst_alloc(init_net.loopback_dev,
2046                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2047         if (!rth)
2048                 goto e_nobufs;
2049
2050 #ifdef CONFIG_IP_ROUTE_CLASSID
2051         rth->dst.tclassid = itag;
2052 #endif
2053         rth->dst.output = ip_rt_bug;
2054
2055         rth->rt_key_dst = daddr;
2056         rth->rt_key_src = saddr;
2057         rth->rt_genid   = rt_genid(dev_net(dev));
2058         rth->rt_flags   = RTCF_MULTICAST;
2059         rth->rt_type    = RTN_MULTICAST;
2060         rth->rt_key_tos = tos;
2061         rth->rt_dst     = daddr;
2062         rth->rt_src     = saddr;
2063         rth->rt_route_iif = dev->ifindex;
2064         rth->rt_iif     = dev->ifindex;
2065         rth->rt_oif     = 0;
2066         rth->rt_mark    = skb->mark;
2067         rth->rt_gateway = daddr;
2068         rth->rt_spec_dst= spec_dst;
2069         rth->rt_peer_genid = 0;
2070         rth->peer = NULL;
2071         rth->fi = NULL;
2072         if (our) {
2073                 rth->dst.input= ip_local_deliver;
2074                 rth->rt_flags |= RTCF_LOCAL;
2075         }
2076
2077 #ifdef CONFIG_IP_MROUTE
2078         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2079                 rth->dst.input = ip_mr_input;
2080 #endif
2081         RT_CACHE_STAT_INC(in_slow_mc);
2082
2083         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2084         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2085         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2086
2087 e_nobufs:
2088         return -ENOBUFS;
2089 e_inval:
2090         return -EINVAL;
2091 e_err:
2092         return err;
2093 }
2094
2095
2096 static void ip_handle_martian_source(struct net_device *dev,
2097                                      struct in_device *in_dev,
2098                                      struct sk_buff *skb,
2099                                      __be32 daddr,
2100                                      __be32 saddr)
2101 {
2102         RT_CACHE_STAT_INC(in_martian_src);
2103 #ifdef CONFIG_IP_ROUTE_VERBOSE
2104         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2105                 /*
2106                  *      RFC1812 recommendation: if the source is martian,
2107                  *      the only hint is the MAC header.
2108                  */
2109                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2110                         &daddr, &saddr, dev->name);
2111                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2112                         int i;
2113                         const unsigned char *p = skb_mac_header(skb);
2114                         printk(KERN_WARNING "ll header: ");
2115                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2116                                 printk("%02x", *p);
2117                                 if (i < (dev->hard_header_len - 1))
2118                                         printk(":");
2119                         }
2120                         printk("\n");
2121                 }
2122         }
2123 #endif
2124 }
2125
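/*
 * __mkroute_input() - build the cache entry for a forwarded packet:
 * validates the source address (setting RTCF_DIRECTSRC and, when the
 * packet would go back out the interface it arrived on,
 * RTCF_DOREDIRECT), refuses to create routes for non-IP frames (e.g.
 * ARP) that would be invalid for proxy arp, and fills in a dst whose
 * input handler is ip_forward and whose output handler is ip_output.
 */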
2126 /* called in rcu_read_lock() section */
2127 static int __mkroute_input(struct sk_buff *skb,
2128                            const struct fib_result *res,
2129                            struct in_device *in_dev,
2130                            __be32 daddr, __be32 saddr, u32 tos,
2131                            struct rtable **result)
2132 {
2133         struct rtable *rth;
2134         int err;
2135         struct in_device *out_dev;
2136         unsigned int flags = 0;
2137         __be32 spec_dst;
2138         u32 itag;
2139
2140         /* get a working reference to the output device */
2141         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2142         if (out_dev == NULL) {
2143                 if (net_ratelimit())
2144                         printk(KERN_CRIT "Bug in ip_route_input" \
2145                                "_slow(). Please, report\n");
2146                 return -EINVAL;
2147         }
2148
2149
2150         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2151                                   in_dev->dev, &spec_dst, &itag);
2152         if (err < 0) {
2153                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2154                                          saddr);
2155
2156                 goto cleanup;
2157         }
2158
2159         if (err)
2160                 flags |= RTCF_DIRECTSRC;
2161
2162         if (out_dev == in_dev && err &&
2163             (IN_DEV_SHARED_MEDIA(out_dev) ||
2164              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2165                 flags |= RTCF_DOREDIRECT;
2166
2167         if (skb->protocol != htons(ETH_P_IP)) {
2168                 /* Not IP (i.e. ARP). Do not create a route if it is
2169                  * invalid for proxy arp. DNAT routes are always valid.
2170                  *
2171                  * The proxy arp feature has been extended to allow ARP
2172                  * replies back on the same interface, to support
2173                  * Private VLAN switch technologies. See arp.c.
2174                  */
2175                 if (out_dev == in_dev &&
2176                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2177                         err = -EINVAL;
2178                         goto cleanup;
2179                 }
2180         }
2181
2182         rth = rt_dst_alloc(out_dev->dev,
2183                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2184                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2185         if (!rth) {
2186                 err = -ENOBUFS;
2187                 goto cleanup;
2188         }
2189
2190         rth->rt_key_dst = daddr;
2191         rth->rt_key_src = saddr;
2192         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2193         rth->rt_flags = flags;
2194         rth->rt_type = res->type;
2195         rth->rt_key_tos = tos;
2196         rth->rt_dst     = daddr;
2197         rth->rt_src     = saddr;
2198         rth->rt_route_iif = in_dev->dev->ifindex;
2199         rth->rt_iif     = in_dev->dev->ifindex;
2200         rth->rt_oif     = 0;
2201         rth->rt_mark    = skb->mark;
2202         rth->rt_gateway = daddr;
2203         rth->rt_spec_dst= spec_dst;
2204         rth->rt_peer_genid = 0;
2205         rth->peer = NULL;
2206         rth->fi = NULL;
2207
2208         rth->dst.input = ip_forward;
2209         rth->dst.output = ip_output;
2210
2211         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2212
2213         *result = rth;
2214         err = 0;
2215  cleanup:
2216         return err;
2217 }
2218
2219 static int ip_mkroute_input(struct sk_buff *skb,
2220                             struct fib_result *res,
2221                             const struct flowi4 *fl4,
2222                             struct in_device *in_dev,
2223                             __be32 daddr, __be32 saddr, u32 tos)
2224 {
2225         struct rtable* rth = NULL;
2226         int err;
2227         unsigned hash;
2228
2229 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2230         if (res->fi && res->fi->fib_nhs > 1)
2231                 fib_select_multipath(res);
2232 #endif
2233
2234         /* create a routing cache entry */
2235         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2236         if (err)
2237                 return err;
2238
2239         /* put it into the cache */
2240         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2241                        rt_genid(dev_net(rth->dst.dev)));
2242         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2243         if (IS_ERR(rth))
2244                 return PTR_ERR(rth);
2245         return 0;
2246 }
2247
2248 /*
2249  *      NOTE. We drop all packets that have local source
2250  *      addresses, because every properly looped-back packet
2251  *      must already have the correct destination attached by the output routine.
2252  *
2253  *      This approach solves two big problems:
2254  *      1. Non-simplex devices are handled properly.
2255  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2256  *      called with rcu_read_lock()
2257  */
2258
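/*
 * ip_route_input_slow() - full input route resolution on a cache miss:
 * screens out martian sources and destinations, looks the flow up in
 * the FIB, and then either hands off to ip_mkroute_input() for
 * forwarded traffic or builds a local/broadcast entry inline, interning
 * the result in the route cache.
 */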
2259 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2260                                u8 tos, struct net_device *dev)
2261 {
2262         struct fib_result res;
2263         struct in_device *in_dev = __in_dev_get_rcu(dev);
2264         struct flowi4   fl4;
2265         unsigned        flags = 0;
2266         u32             itag = 0;
2267         struct rtable * rth;
2268         unsigned        hash;
2269         __be32          spec_dst;
2270         int             err = -EINVAL;
2271         struct net    * net = dev_net(dev);
2272
2273         /* IP on this device is disabled. */
2274
2275         if (!in_dev)
2276                 goto out;
2277
2278         /* Check for the most weird martians, which cannot be detected
2279            by fib_lookup.
2280          */
2281
2282         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2283             ipv4_is_loopback(saddr))
2284                 goto martian_source;
2285
2286         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2287                 goto brd_input;
2288
2289         /* Accept zero addresses only for limited broadcast;
2290          * I do not even know whether to fix this or not. Waiting for complaints :-)
2291          */
2292         if (ipv4_is_zeronet(saddr))
2293                 goto martian_source;
2294
2295         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2296                 goto martian_destination;
2297
2298         /*
2299          *      Now we are ready to route the packet.
2300          */
2301         fl4.flowi4_oif = 0;
2302         fl4.flowi4_iif = dev->ifindex;
2303         fl4.flowi4_mark = skb->mark;
2304         fl4.flowi4_tos = tos;
2305         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2306         fl4.daddr = daddr;
2307         fl4.saddr = saddr;
2308         err = fib_lookup(net, &fl4, &res);
2309         if (err != 0) {
2310                 if (!IN_DEV_FORWARD(in_dev))
2311                         goto e_hostunreach;
2312                 goto no_route;
2313         }
2314
2315         RT_CACHE_STAT_INC(in_slow_tot);
2316
2317         if (res.type == RTN_BROADCAST)
2318                 goto brd_input;
2319
2320         if (res.type == RTN_LOCAL) {
2321                 err = fib_validate_source(skb, saddr, daddr, tos,
2322                                           net->loopback_dev->ifindex,
2323                                           dev, &spec_dst, &itag);
2324                 if (err < 0)
2325                         goto martian_source_keep_err;
2326                 if (err)
2327                         flags |= RTCF_DIRECTSRC;
2328                 spec_dst = daddr;
2329                 goto local_input;
2330         }
2331
2332         if (!IN_DEV_FORWARD(in_dev))
2333                 goto e_hostunreach;
2334         if (res.type != RTN_UNICAST)
2335                 goto martian_destination;
2336
2337         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2338 out:    return err;
2339
2340 brd_input:
2341         if (skb->protocol != htons(ETH_P_IP))
2342                 goto e_inval;
2343
2344         if (ipv4_is_zeronet(saddr))
2345                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2346         else {
2347                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2348                                           &itag);
2349                 if (err < 0)
2350                         goto martian_source_keep_err;
2351                 if (err)
2352                         flags |= RTCF_DIRECTSRC;
2353         }
2354         flags |= RTCF_BROADCAST;
2355         res.type = RTN_BROADCAST;
2356         RT_CACHE_STAT_INC(in_brd);
2357
2358 local_input:
2359         rth = rt_dst_alloc(net->loopback_dev,
2360                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2361         if (!rth)
2362                 goto e_nobufs;
2363
2364         rth->dst.input= ip_local_deliver;
2365         rth->dst.output= ip_rt_bug;
2366 #ifdef CONFIG_IP_ROUTE_CLASSID
2367         rth->dst.tclassid = itag;
2368 #endif
2369
2370         rth->rt_key_dst = daddr;
2371         rth->rt_key_src = saddr;
2372         rth->rt_genid = rt_genid(net);
2373         rth->rt_flags   = flags|RTCF_LOCAL;
2374         rth->rt_type    = res.type;
2375         rth->rt_key_tos = tos;
2376         rth->rt_dst     = daddr;
2377         rth->rt_src     = saddr;
2378 #ifdef CONFIG_IP_ROUTE_CLASSID
2379         rth->dst.tclassid = itag;
2380 #endif
2381         rth->rt_route_iif = dev->ifindex;
2382         rth->rt_iif     = dev->ifindex;
2383         rth->rt_oif     = 0;
2384         rth->rt_mark    = skb->mark;
2385         rth->rt_gateway = daddr;
2386         rth->rt_spec_dst= spec_dst;
2387         rth->rt_peer_genid = 0;
2388         rth->peer = NULL;
2389         rth->fi = NULL;
2390         if (res.type == RTN_UNREACHABLE) {
2391                 rth->dst.input= ip_error;
2392                 rth->dst.error= -err;
2393                 rth->rt_flags   &= ~RTCF_LOCAL;
2394         }
2395         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2396         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2397         err = 0;
2398         if (IS_ERR(rth))
2399                 err = PTR_ERR(rth);
2400         goto out;
2401
2402 no_route:
2403         RT_CACHE_STAT_INC(in_no_route);
2404         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2405         res.type = RTN_UNREACHABLE;
2406         if (err == -ESRCH)
2407                 err = -ENETUNREACH;
2408         goto local_input;
2409
2410         /*
2411          *      Do not cache martian addresses: they should be logged (RFC1812)
2412          */
2413 martian_destination:
2414         RT_CACHE_STAT_INC(in_martian_dst);
2415 #ifdef CONFIG_IP_ROUTE_VERBOSE
2416         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2417                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2418                         &daddr, &saddr, dev->name);
2419 #endif
2420
2421 e_hostunreach:
2422         err = -EHOSTUNREACH;
2423         goto out;
2424
2425 e_inval:
2426         err = -EINVAL;
2427         goto out;
2428
2429 e_nobufs:
2430         err = -ENOBUFS;
2431         goto out;
2432
2433 martian_source:
2434         err = -EINVAL;
2435 martian_source_keep_err:
2436         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2437         goto out;
2438 }
2439
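/*
 * ip_route_input_common() - entry point for input routing.
 * Probes the route cache first (keyed on daddr/saddr/iif/tos/mark and
 * the current generation id); on a miss, multicast destinations go to
 * ip_route_input_mc() and everything else to ip_route_input_slow().
 */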
2440 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2441                            u8 tos, struct net_device *dev, bool noref)
2442 {
2443         struct rtable * rth;
2444         unsigned        hash;
2445         int iif = dev->ifindex;
2446         struct net *net;
2447         int res;
2448
2449         net = dev_net(dev);
2450
2451         rcu_read_lock();
2452
2453         if (!rt_caching(net))
2454                 goto skip_cache;
2455
2456         tos &= IPTOS_RT_MASK;
2457         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2458
2459         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2460              rth = rcu_dereference(rth->dst.rt_next)) {
2461                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2462                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2463                      (rth->rt_route_iif ^ iif) |
2464                      (rth->rt_key_tos ^ tos)) == 0 &&
2465                     rth->rt_mark == skb->mark &&
2466                     net_eq(dev_net(rth->dst.dev), net) &&
2467                     !rt_is_expired(rth)) {
2468                         ipv4_validate_peer(rth);
2469                         if (noref) {
2470                                 dst_use_noref(&rth->dst, jiffies);
2471                                 skb_dst_set_noref(skb, &rth->dst);
2472                         } else {
2473                                 dst_use(&rth->dst, jiffies);
2474                                 skb_dst_set(skb, &rth->dst);
2475                         }
2476                         RT_CACHE_STAT_INC(in_hit);
2477                         rcu_read_unlock();
2478                         return 0;
2479                 }
2480                 RT_CACHE_STAT_INC(in_hlist_search);
2481         }
2482
2483 skip_cache:
2484         /* The multicast recognition logic has been moved from the route cache
2485            to here.  The problem was that too many Ethernet cards have
2486            broken/missing hardware multicast filters :-( As a result, a host
2487            on a multicast network acquires a lot of useless route cache
2488            entries, e.g. from SDR messages arriving from all over the world.
2489            Now we try to get rid of them.  Really, provided the software IP
2490            multicast filter is organized reasonably (at least, hashed), this
2491            does not result in a slowdown compared with route cache reject
2492            entries.  Note that multicast routers are not affected, because a
2493            route cache entry is created for them eventually.
2494          */
2495         if (ipv4_is_multicast(daddr)) {
2496                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2497
2498                 if (in_dev) {
2499                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2500                                                   ip_hdr(skb)->protocol);
2501                         if (our
2502 #ifdef CONFIG_IP_MROUTE
2503                                 ||
2504                             (!ipv4_is_local_multicast(daddr) &&
2505                              IN_DEV_MFORWARD(in_dev))
2506 #endif
2507                            ) {
2508                                 int res = ip_route_input_mc(skb, daddr, saddr,
2509                                                             tos, dev, our);
2510                                 rcu_read_unlock();
2511                                 return res;
2512                         }
2513                 }
2514                 rcu_read_unlock();
2515                 return -EINVAL;
2516         }
2517         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2518         rcu_read_unlock();
2519         return res;
2520 }
2521 EXPORT_SYMBOL(ip_route_input_common);
2522
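/*
 * __mkroute_output() - materialise an output route from a FIB result:
 * classifies the destination (local, broadcast, multicast), allocates
 * the dst, sets up the ip_output / ip_local_deliver / ip_mc_output /
 * ip_mr_input handlers as the flags dictate, and lets rt_set_nexthop()
 * fill in the gateway, metrics and classid.
 */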
2523 /* called with rcu_read_lock() */
2524 static struct rtable *__mkroute_output(const struct fib_result *res,
2525                                        const struct flowi4 *fl4,
2526                                        __be32 orig_daddr, __be32 orig_saddr,
2527                                        int orig_oif, __u8 orig_rtos,
2528                                        struct net_device *dev_out,
2529                                        unsigned int flags)
2530 {
2531         struct fib_info *fi = res->fi;
2532         struct in_device *in_dev;
2533         u16 type = res->type;
2534         struct rtable *rth;
2535
2536         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2537                 return ERR_PTR(-EINVAL);
2538
2539         if (ipv4_is_lbcast(fl4->daddr))
2540                 type = RTN_BROADCAST;
2541         else if (ipv4_is_multicast(fl4->daddr))
2542                 type = RTN_MULTICAST;
2543         else if (ipv4_is_zeronet(fl4->daddr))
2544                 return ERR_PTR(-EINVAL);
2545
2546         if (dev_out->flags & IFF_LOOPBACK)
2547                 flags |= RTCF_LOCAL;
2548
2549         in_dev = __in_dev_get_rcu(dev_out);
2550         if (!in_dev)
2551                 return ERR_PTR(-EINVAL);
2552
2553         if (type == RTN_BROADCAST) {
2554                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2555                 fi = NULL;
2556         } else if (type == RTN_MULTICAST) {
2557                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2558                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2559                                      fl4->flowi4_proto))
2560                         flags &= ~RTCF_LOCAL;
2561                 /* If a multicast route does not exist, use the
2562                  * default one, but do not gateway in this case.
2563                  * Yes, it is a hack.
2564                  */
2565                 if (fi && res->prefixlen < 4)
2566                         fi = NULL;
2567         }
2568
2569         rth = rt_dst_alloc(dev_out,
2570                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2571                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2572         if (!rth)
2573                 return ERR_PTR(-ENOBUFS);
2574
2575         rth->dst.output = ip_output;
2576
2577         rth->rt_key_dst = orig_daddr;
2578         rth->rt_key_src = orig_saddr;
2579         rth->rt_genid = rt_genid(dev_net(dev_out));
2580         rth->rt_flags   = flags;
2581         rth->rt_type    = type;
2582         rth->rt_key_tos = orig_rtos;
2583         rth->rt_dst     = fl4->daddr;
2584         rth->rt_src     = fl4->saddr;
2585         rth->rt_route_iif = 0;
2586         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2587         rth->rt_oif     = orig_oif;
2588         rth->rt_mark    = fl4->flowi4_mark;
2589         rth->rt_gateway = fl4->daddr;
2590         rth->rt_spec_dst= fl4->saddr;
2591         rth->rt_peer_genid = 0;
2592         rth->peer = NULL;
2593         rth->fi = NULL;
2594
2595         RT_CACHE_STAT_INC(out_slow_tot);
2596
2597         if (flags & RTCF_LOCAL) {
2598                 rth->dst.input = ip_local_deliver;
2599                 rth->rt_spec_dst = fl4->daddr;
2600         }
2601         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2602                 rth->rt_spec_dst = fl4->saddr;
2603                 if (flags & RTCF_LOCAL &&
2604                     !(dev_out->flags & IFF_LOOPBACK)) {
2605                         rth->dst.output = ip_mc_output;
2606                         RT_CACHE_STAT_INC(out_slow_mc);
2607                 }
2608 #ifdef CONFIG_IP_MROUTE
2609                 if (type == RTN_MULTICAST) {
2610                         if (IN_DEV_MFORWARD(in_dev) &&
2611                             !ipv4_is_local_multicast(fl4->daddr)) {
2612                                 rth->dst.input = ip_mr_input;
2613                                 rth->dst.output = ip_mc_output;
2614                         }
2615                 }
2616 #endif
2617         }
2618
2619         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2620
2621         return rth;
2622 }
2623
2624 /*
2625  * Major route resolver routine.
2626  * called with rcu_read_lock();
2627  */
2628
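/*
 * Resolves the source address and output device for the flow, consults
 * the FIB (assuming an on-link destination when an oif was given but no
 * route exists), and finally builds and caches the entry through
 * __mkroute_output().
 */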
2629 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2630 {
2631         struct net_device *dev_out = NULL;
2632         __u8 tos = RT_FL_TOS(fl4);
2633         unsigned int flags = 0;
2634         struct fib_result res;
2635         struct rtable *rth;
2636         __be32 orig_daddr;
2637         __be32 orig_saddr;
2638         int orig_oif;
2639
2640         res.fi          = NULL;
2641 #ifdef CONFIG_IP_MULTIPLE_TABLES
2642         res.r           = NULL;
2643 #endif
2644
2645         orig_daddr = fl4->daddr;
2646         orig_saddr = fl4->saddr;
2647         orig_oif = fl4->flowi4_oif;
2648
2649         fl4->flowi4_iif = net->loopback_dev->ifindex;
2650         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2651         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2652                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2653
2654         rcu_read_lock();
2655         if (fl4->saddr) {
2656                 rth = ERR_PTR(-EINVAL);
2657                 if (ipv4_is_multicast(fl4->saddr) ||
2658                     ipv4_is_lbcast(fl4->saddr) ||
2659                     ipv4_is_zeronet(fl4->saddr))
2660                         goto out;
2661
2662                 /* I removed the check for oif == dev_out->oif here.
2663                    It was wrong for two reasons:
2664                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2665                       is assigned to multiple interfaces.
2666                    2. Moreover, we are allowed to send packets with a saddr
2667                       belonging to another iface. --ANK
2668                  */
2669
2670                 if (fl4->flowi4_oif == 0 &&
2671                     (ipv4_is_multicast(fl4->daddr) ||
2672                      ipv4_is_lbcast(fl4->daddr))) {
2673                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2674                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2675                         if (dev_out == NULL)
2676                                 goto out;
2677
2678                         /* Special hack: the user can direct multicasts
2679                            and limited broadcast via the necessary interface
2680                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2681                            This hack is not just for fun; it allows
2682                            vic, vat and friends to work.
2683                            They bind the socket to loopback, set the ttl to zero
2684                            and expect that it will work.
2685                            From the viewpoint of the routing cache they are broken,
2686                            because we are not allowed to build a multicast path
2687                            with a loopback source addr (look, the routing cache
2688                            cannot know that the ttl is zero, so the packet
2689                            will not leave this host and the route is valid).
2690                            Luckily, this hack is a good workaround.
2691                          */
2692
2693                         fl4->flowi4_oif = dev_out->ifindex;
2694                         goto make_route;
2695                 }
2696
2697                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2698                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2699                         if (!__ip_dev_find(net, fl4->saddr, false))
2700                                 goto out;
2701                 }
2702         }
2703
2704
2705         if (fl4->flowi4_oif) {
2706                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2707                 rth = ERR_PTR(-ENODEV);
2708                 if (dev_out == NULL)
2709                         goto out;
2710
2711                 /* RACE: Check return value of inet_select_addr instead. */
2712                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2713                         rth = ERR_PTR(-ENETUNREACH);
2714                         goto out;
2715                 }
2716                 if (ipv4_is_local_multicast(fl4->daddr) ||
2717                     ipv4_is_lbcast(fl4->daddr)) {
2718                         if (!fl4->saddr)
2719                                 fl4->saddr = inet_select_addr(dev_out, 0,
2720                                                               RT_SCOPE_LINK);
2721                         goto make_route;
2722                 }
2723                 if (fl4->saddr) {
2724                         if (ipv4_is_multicast(fl4->daddr))
2725                                 fl4->saddr = inet_select_addr(dev_out, 0,
2726                                                               fl4->flowi4_scope);
2727                         else if (!fl4->daddr)
2728                                 fl4->saddr = inet_select_addr(dev_out, 0,
2729                                                               RT_SCOPE_HOST);
2730                 }
2731         }
2732
2733         if (!fl4->daddr) {
2734                 fl4->daddr = fl4->saddr;
2735                 if (!fl4->daddr)
2736                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2737                 dev_out = net->loopback_dev;
2738                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2739                 res.type = RTN_LOCAL;
2740                 flags |= RTCF_LOCAL;
2741                 goto make_route;
2742         }
2743
2744         if (fib_lookup(net, fl4, &res)) {
2745                 res.fi = NULL;
2746                 if (fl4->flowi4_oif) {
2747                         /* Apparently, the routing tables are wrong. Assume
2748                            that the destination is on-link.
2749
2750                            WHY? DW.
2751                            Because we are allowed to send to an iface
2752                            even if it has NO routes and NO assigned
2753                            addresses. When oif is specified, the routing
2754                            tables are looked up with only one purpose:
2755                            to catch whether the destination is gatewayed rather
2756                            than direct. Moreover, if MSG_DONTROUTE is set,
2757                            we send the packet, ignoring both the routing tables
2758                            and the ifaddr state. --ANK
2759
2760
2761                            We could do this even if oif is unknown
2762                            (as IPv6 likely does), but we do not.
2763                          */
2764
2765                         if (fl4->saddr == 0)
2766                                 fl4->saddr = inet_select_addr(dev_out, 0,
2767                                                               RT_SCOPE_LINK);
2768                         res.type = RTN_UNICAST;
2769                         goto make_route;
2770                 }
2771                 rth = ERR_PTR(-ENETUNREACH);
2772                 goto out;
2773         }
2774
2775         if (res.type == RTN_LOCAL) {
2776                 if (!fl4->saddr) {
2777                         if (res.fi->fib_prefsrc)
2778                                 fl4->saddr = res.fi->fib_prefsrc;
2779                         else
2780                                 fl4->saddr = fl4->daddr;
2781                 }
2782                 dev_out = net->loopback_dev;
2783                 fl4->flowi4_oif = dev_out->ifindex;
2784                 res.fi = NULL;
2785                 flags |= RTCF_LOCAL;
2786                 goto make_route;
2787         }
2788
2789 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2790         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2791                 fib_select_multipath(&res);
2792         else
2793 #endif
2794         if (!res.prefixlen &&
2795             res.table->tb_num_default > 1 &&
2796             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2797                 fib_select_default(&res);
2798
2799         if (!fl4->saddr)
2800                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2801
2802         dev_out = FIB_RES_DEV(res);
2803         fl4->flowi4_oif = dev_out->ifindex;
2804
2805
2806 make_route:
2807         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2808                                tos, dev_out, flags);
2809         if (!IS_ERR(rth)) {
2810                 unsigned int hash;
2811
2812                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2813                                rt_genid(dev_net(dev_out)));
2814                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2815         }
2816
2817 out:
2818         rcu_read_unlock();
2819         return rth;
2820 }
2821
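/*
 * __ip_route_output_key() - output route lookup: probe the route cache
 * (keyed on daddr/saddr/oif/mark/tos) and fall back to
 * ip_route_output_slow() on a miss or when caching is disabled.
 */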
2822 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2823 {
2824         struct rtable *rth;
2825         unsigned int hash;
2826
2827         if (!rt_caching(net))
2828                 goto slow_output;
2829
2830         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2831
2832         rcu_read_lock_bh();
2833         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2834                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2835                 if (rth->rt_key_dst == flp4->daddr &&
2836                     rth->rt_key_src == flp4->saddr &&
2837                     rt_is_output_route(rth) &&
2838                     rth->rt_oif == flp4->flowi4_oif &&
2839                     rth->rt_mark == flp4->flowi4_mark &&
2840                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2841                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2842                     net_eq(dev_net(rth->dst.dev), net) &&
2843                     !rt_is_expired(rth)) {
2844                         ipv4_validate_peer(rth);
2845                         dst_use(&rth->dst, jiffies);
2846                         RT_CACHE_STAT_INC(out_hit);
2847                         rcu_read_unlock_bh();
2848                         if (!flp4->saddr)
2849                                 flp4->saddr = rth->rt_src;
2850                         if (!flp4->daddr)
2851                                 flp4->daddr = rth->rt_dst;
2852                         return rth;
2853                 }
2854                 RT_CACHE_STAT_INC(out_hlist_search);
2855         }
2856         rcu_read_unlock_bh();
2857
2858 slow_output:
2859         return ip_route_output_slow(net, flp4);
2860 }
2861 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2862
2863 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2864 {
2865         return NULL;
2866 }
2867
2868 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2869 {
2870         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2871
2872         return mtu ? : dst->dev->mtu;
2873 }
2874
2875 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2876 {
2877 }
2878
2879 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2880                                           unsigned long old)
2881 {
2882         return NULL;
2883 }
2884
2885 static struct dst_ops ipv4_dst_blackhole_ops = {
2886         .family                 =       AF_INET,
2887         .protocol               =       cpu_to_be16(ETH_P_IP),
2888         .destroy                =       ipv4_dst_destroy,
2889         .check                  =       ipv4_blackhole_dst_check,
2890         .mtu                    =       ipv4_blackhole_mtu,
2891         .default_advmss         =       ipv4_default_advmss,
2892         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2893         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2894         .neigh_lookup           =       ipv4_neigh_lookup,
2895 };
2896
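/*
 * ipv4_blackhole_route() - clone @dst_orig into a route that silently
 * discards traffic: the copy keeps the original's keys, metrics and
 * peer, but uses dst_discard for both input and output and is backed by
 * ipv4_dst_blackhole_ops, whose update_pmtu does nothing and whose
 * cow_metrics never allocates.
 */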
2897 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2898 {
2899         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2900         struct rtable *ort = (struct rtable *) dst_orig;
2901
2902         if (rt) {
2903                 struct dst_entry *new = &rt->dst;
2904
2905                 new->__use = 1;
2906                 new->input = dst_discard;
2907                 new->output = dst_discard;
2908                 dst_copy_metrics(new, &ort->dst);
2909
2910                 new->dev = ort->dst.dev;
2911                 if (new->dev)
2912                         dev_hold(new->dev);
2913
2914                 rt->rt_key_dst = ort->rt_key_dst;
2915                 rt->rt_key_src = ort->rt_key_src;
2916                 rt->rt_key_tos = ort->rt_key_tos;
2917                 rt->rt_route_iif = ort->rt_route_iif;
2918                 rt->rt_iif = ort->rt_iif;
2919                 rt->rt_oif = ort->rt_oif;
2920                 rt->rt_mark = ort->rt_mark;
2921
2922                 rt->rt_genid = rt_genid(net);
2923                 rt->rt_flags = ort->rt_flags;
2924                 rt->rt_type = ort->rt_type;
2925                 rt->rt_dst = ort->rt_dst;
2926                 rt->rt_src = ort->rt_src;
2927                 rt->rt_gateway = ort->rt_gateway;
2928                 rt->rt_spec_dst = ort->rt_spec_dst;
2929                 rt->peer = ort->peer;
2930                 if (rt->peer)
2931                         atomic_inc(&rt->peer->refcnt);
2932                 rt->fi = ort->fi;
2933                 if (rt->fi)
2934                         atomic_inc(&rt->fi->fib_clntref);
2935
2936                 dst_free(new);
2937         }
2938
2939         dst_release(dst_orig);
2940
2941         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2942 }
2943
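/*
 * ip_route_output_flow - output route lookup with xfrm policy applied.
 *
 * Thin wrapper around __ip_route_output_key(): when the flow carries a
 * transport protocol, the plain route is additionally passed through
 * xfrm_lookup() so a matching IPsec policy can substitute a transformed
 * dst.
 *
 * Hypothetical caller sketch:
 *
 *	fl4->flowi4_proto = IPPROTO_UDP;
 *	rt = ip_route_output_flow(net, fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */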
2944 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2945                                     struct sock *sk)
2946 {
2947         struct rtable *rt = __ip_route_output_key(net, flp4);
2948
2949         if (IS_ERR(rt))
2950                 return rt;
2951
2952         if (flp4->flowi4_proto)
2953                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2954                                                    flowi4_to_flowi(flp4),
2955                                                    sk, 0);
2956
2957         return rt;
2958 }
2959 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2960
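/*
 * rt_fill_info - encode a cached route as an RTM_NEWROUTE netlink message.
 *
 * Emits the rtmsg header plus RTA_TABLE/DST/SRC/OIF/FLOW/PREFSRC/GATEWAY/
 * MARK attributes and the metrics, then rtnl_put_cacheinfo() with the
 * peer's IP id, TCP timestamp data and PMTU expiry.  For input routes the
 * incoming interface is reported via RTA_IIF, unless the destination is a
 * forwarded multicast group, in which case ipmr_get_route() supplies the
 * multicast forwarding state instead.
 */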
2961 static int rt_fill_info(struct net *net,
2962                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2963                         int nowait, unsigned int flags)
2964 {
2965         struct rtable *rt = skb_rtable(skb);
2966         struct rtmsg *r;
2967         struct nlmsghdr *nlh;
2968         unsigned long expires = 0;
2969         const struct inet_peer *peer = rt->peer;
2970         u32 id = 0, ts = 0, tsage = 0, error;
2971
2972         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2973         if (nlh == NULL)
2974                 return -EMSGSIZE;
2975
2976         r = nlmsg_data(nlh);
2977         r->rtm_family    = AF_INET;
2978         r->rtm_dst_len  = 32;
2979         r->rtm_src_len  = 0;
2980         r->rtm_tos      = rt->rt_key_tos;
2981         r->rtm_table    = RT_TABLE_MAIN;
2982         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2983         r->rtm_type     = rt->rt_type;
2984         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2985         r->rtm_protocol = RTPROT_UNSPEC;
2986         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2987         if (rt->rt_flags & RTCF_NOTIFY)
2988                 r->rtm_flags |= RTM_F_NOTIFY;
2989
2990         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2991
2992         if (rt->rt_key_src) {
2993                 r->rtm_src_len = 32;
2994                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2995         }
2996         if (rt->dst.dev)
2997                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2998 #ifdef CONFIG_IP_ROUTE_CLASSID
2999         if (rt->dst.tclassid)
3000                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3001 #endif
3002         if (rt_is_input_route(rt))
3003                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3004         else if (rt->rt_src != rt->rt_key_src)
3005                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3006
3007         if (rt->rt_dst != rt->rt_gateway)
3008                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3009
3010         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3011                 goto nla_put_failure;
3012
3013         if (rt->rt_mark)
3014                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3015
3016         error = rt->dst.error;
3017         if (peer) {
3018                 inet_peer_refcheck(rt->peer);
3019                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3020                 if (peer->tcp_ts_stamp) {
3021                         ts = peer->tcp_ts;
3022                         tsage = get_seconds() - peer->tcp_ts_stamp;
3023                 }
3024                 expires = ACCESS_ONCE(peer->pmtu_expires);
3025                 if (expires) {
3026                         if (time_before(jiffies, expires))
3027                                 expires -= jiffies;
3028                         else
3029                                 expires = 0;
3030                 }
3031         }
3032
3033         if (rt_is_input_route(rt)) {
3034 #ifdef CONFIG_IP_MROUTE
3035                 __be32 dst = rt->rt_dst;
3036
3037                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3038                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3039                         int err = ipmr_get_route(net, skb,
3040                                                  rt->rt_src, rt->rt_dst,
3041                                                  r, nowait);
3042                         if (err <= 0) {
3043                                 if (!nowait) {
3044                                         if (err == 0)
3045                                                 return 0;
3046                                         goto nla_put_failure;
3047                                 } else {
3048                                         if (err == -EMSGSIZE)
3049                                                 goto nla_put_failure;
3050                                         error = err;
3051                                 }
3052                         }
3053                 } else
3054 #endif
3055                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3056         }
3057
3058         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3059                                expires, error) < 0)
3060                 goto nla_put_failure;
3061
3062         return nlmsg_end(skb, nlh);
3063
3064 nla_put_failure:
3065         nlmsg_cancel(skb, nlh);
3066         return -EMSGSIZE;
3067 }
3068
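/*
 * inet_rtm_getroute - handle an RTM_GETROUTE request ("ip route get").
 *
 * Parses the request attributes, builds a dummy skb with just enough of an
 * IP header, and resolves the route either by simulating reception through
 * ip_route_input() (when RTA_IIF is supplied) or by an ordinary output
 * lookup with ip_route_output_key().  The result is encoded with
 * rt_fill_info() and unicast back to the requester.
 *
 * Hypothetical userspace trigger:
 *
 *	$ ip route get 192.0.2.1
 */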
3069 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3070 {
3071         struct net *net = sock_net(in_skb->sk);
3072         struct rtmsg *rtm;
3073         struct nlattr *tb[RTA_MAX+1];
3074         struct rtable *rt = NULL;
3075         __be32 dst = 0;
3076         __be32 src = 0;
3077         u32 iif;
3078         int err;
3079         int mark;
3080         struct sk_buff *skb;
3081
3082         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3083         if (err < 0)
3084                 goto errout;
3085
3086         rtm = nlmsg_data(nlh);
3087
3088         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3089         if (skb == NULL) {
3090                 err = -ENOBUFS;
3091                 goto errout;
3092         }
3093
3094         /* Reserve room for dummy headers; this skb can pass
3095            through a good chunk of the routing engine.
3096          */
3097         skb_reset_mac_header(skb);
3098         skb_reset_network_header(skb);
3099
3100         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3101         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3102         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3103
3104         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3105         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3106         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3107         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3108
3109         if (iif) {
3110                 struct net_device *dev;
3111
3112                 dev = __dev_get_by_index(net, iif);
3113                 if (dev == NULL) {
3114                         err = -ENODEV;
3115                         goto errout_free;
3116                 }
3117
3118                 skb->protocol   = htons(ETH_P_IP);
3119                 skb->dev        = dev;
3120                 skb->mark       = mark;
3121                 local_bh_disable();
3122                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3123                 local_bh_enable();
3124
3125                 rt = skb_rtable(skb);
3126                 if (err == 0 && rt->dst.error)
3127                         err = -rt->dst.error;
3128         } else {
3129                 struct flowi4 fl4 = {
3130                         .daddr = dst,
3131                         .saddr = src,
3132                         .flowi4_tos = rtm->rtm_tos,
3133                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3134                         .flowi4_mark = mark,
3135                 };
3136                 rt = ip_route_output_key(net, &fl4);
3137
3138                 err = 0;
3139                 if (IS_ERR(rt))
3140                         err = PTR_ERR(rt);
3141         }
3142
3143         if (err)
3144                 goto errout_free;
3145
3146         skb_dst_set(skb, &rt->dst);
3147         if (rtm->rtm_flags & RTM_F_NOTIFY)
3148                 rt->rt_flags |= RTCF_NOTIFY;
3149
3150         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3151                            RTM_NEWROUTE, 0, 0);
3152         if (err <= 0)
3153                 goto errout_free;
3154
3155         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3156 errout:
3157         return err;
3158
3159 errout_free:
3160         kfree_skb(skb);
3161         goto errout;
3162 }
3163
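/*
 * ip_rt_dump - netlink dump callback for the route cache.
 *
 * Walks every hash bucket under rcu_read_lock_bh() and emits one
 * RTM_NEWROUTE message per unexpired entry belonging to the requesting
 * namespace, using cb->args[0]/args[1] to resume from the last bucket and
 * chain position when the dump spans multiple skbs.
 */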
3164 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3165 {
3166         struct rtable *rt;
3167         int h, s_h;
3168         int idx, s_idx;
3169         struct net *net;
3170
3171         net = sock_net(skb->sk);
3172
3173         s_h = cb->args[0];
3174         if (s_h < 0)
3175                 s_h = 0;
3176         s_idx = idx = cb->args[1];
3177         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3178                 if (!rt_hash_table[h].chain)
3179                         continue;
3180                 rcu_read_lock_bh();
3181                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3182                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3183                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3184                                 continue;
3185                         if (rt_is_expired(rt))
3186                                 continue;
3187                         skb_dst_set_noref(skb, &rt->dst);
3188                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3189                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3190                                          1, NLM_F_MULTI) <= 0) {
3191                                 skb_dst_drop(skb);
3192                                 rcu_read_unlock_bh();
3193                                 goto done;
3194                         }
3195                         skb_dst_drop(skb);
3196                 }
3197                 rcu_read_unlock_bh();
3198         }
3199
3200 done:
3201         cb->args[0] = h;
3202         cb->args[1] = idx;
3203         return skb->len;
3204 }
3205
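/* Multicast configuration changes invalidate the whole per-namespace route cache. */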
3206 void ip_rt_multicast_event(struct in_device *in_dev)
3207 {
3208         rt_cache_flush(dev_net(in_dev->dev), 0);
3209 }
3210
3211 #ifdef CONFIG_SYSCTL
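/*
 * Write handler for the per-namespace "flush" sysctl: copy the table entry
 * so proc_dointvec() can parse the user-supplied delay into a local
 * variable, then flush that namespace's route cache.  Reads return -EINVAL.
 *
 * Illustrative use (path as registered below):
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 */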
3212 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3213                                         void __user *buffer,
3214                                         size_t *lenp, loff_t *ppos)
3215 {
3216         if (write) {
3217                 int flush_delay;
3218                 ctl_table ctl;
3219                 struct net *net;
3220
3221                 memcpy(&ctl, __ctl, sizeof(ctl));
3222                 ctl.data = &flush_delay;
3223                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3224
3225                 net = (struct net *)__ctl->extra1;
3226                 rt_cache_flush(net, flush_delay);
3227                 return 0;
3228         }
3229
3230         return -EINVAL;
3231 }
3232
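/*
 * Tunables exported under /proc/sys/net/ipv4/route/: route cache garbage
 * collection thresholds and intervals, ICMP redirect and error rate
 * limiting, and PMTU behaviour (mtu_expires, min_pmtu, min_adv_mss).
 */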
3233 static ctl_table ipv4_route_table[] = {
3234         {
3235                 .procname       = "gc_thresh",
3236                 .data           = &ipv4_dst_ops.gc_thresh,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec,
3240         },
3241         {
3242                 .procname       = "max_size",
3243                 .data           = &ip_rt_max_size,
3244                 .maxlen         = sizeof(int),
3245                 .mode           = 0644,
3246                 .proc_handler   = proc_dointvec,
3247         },
3248         {
3249                 /*  Deprecated. Use gc_min_interval_ms */
3250
3251                 .procname       = "gc_min_interval",
3252                 .data           = &ip_rt_gc_min_interval,
3253                 .maxlen         = sizeof(int),
3254                 .mode           = 0644,
3255                 .proc_handler   = proc_dointvec_jiffies,
3256         },
3257         {
3258                 .procname       = "gc_min_interval_ms",
3259                 .data           = &ip_rt_gc_min_interval,
3260                 .maxlen         = sizeof(int),
3261                 .mode           = 0644,
3262                 .proc_handler   = proc_dointvec_ms_jiffies,
3263         },
3264         {
3265                 .procname       = "gc_timeout",
3266                 .data           = &ip_rt_gc_timeout,
3267                 .maxlen         = sizeof(int),
3268                 .mode           = 0644,
3269                 .proc_handler   = proc_dointvec_jiffies,
3270         },
3271         {
3272                 .procname       = "gc_interval",
3273                 .data           = &ip_rt_gc_interval,
3274                 .maxlen         = sizeof(int),
3275                 .mode           = 0644,
3276                 .proc_handler   = proc_dointvec_jiffies,
3277         },
3278         {
3279                 .procname       = "redirect_load",
3280                 .data           = &ip_rt_redirect_load,
3281                 .maxlen         = sizeof(int),
3282                 .mode           = 0644,
3283                 .proc_handler   = proc_dointvec,
3284         },
3285         {
3286                 .procname       = "redirect_number",
3287                 .data           = &ip_rt_redirect_number,
3288                 .maxlen         = sizeof(int),
3289                 .mode           = 0644,
3290                 .proc_handler   = proc_dointvec,
3291         },
3292         {
3293                 .procname       = "redirect_silence",
3294                 .data           = &ip_rt_redirect_silence,
3295                 .maxlen         = sizeof(int),
3296                 .mode           = 0644,
3297                 .proc_handler   = proc_dointvec,
3298         },
3299         {
3300                 .procname       = "error_cost",
3301                 .data           = &ip_rt_error_cost,
3302                 .maxlen         = sizeof(int),
3303                 .mode           = 0644,
3304                 .proc_handler   = proc_dointvec,
3305         },
3306         {
3307                 .procname       = "error_burst",
3308                 .data           = &ip_rt_error_burst,
3309                 .maxlen         = sizeof(int),
3310                 .mode           = 0644,
3311                 .proc_handler   = proc_dointvec,
3312         },
3313         {
3314                 .procname       = "gc_elasticity",
3315                 .data           = &ip_rt_gc_elasticity,
3316                 .maxlen         = sizeof(int),
3317                 .mode           = 0644,
3318                 .proc_handler   = proc_dointvec,
3319         },
3320         {
3321                 .procname       = "mtu_expires",
3322                 .data           = &ip_rt_mtu_expires,
3323                 .maxlen         = sizeof(int),
3324                 .mode           = 0644,
3325                 .proc_handler   = proc_dointvec_jiffies,
3326         },
3327         {
3328                 .procname       = "min_pmtu",
3329                 .data           = &ip_rt_min_pmtu,
3330                 .maxlen         = sizeof(int),
3331                 .mode           = 0644,
3332                 .proc_handler   = proc_dointvec,
3333         },
3334         {
3335                 .procname       = "min_adv_mss",
3336                 .data           = &ip_rt_min_advmss,
3337                 .maxlen         = sizeof(int),
3338                 .mode           = 0644,
3339                 .proc_handler   = proc_dointvec,
3340         },
3341         { }
3342 };
3343
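/*
 * Skeleton registered early from ip_static_sysctl_init() below: the global
 * "route" table above plus a placeholder "neigh" directory are attached
 * under net/ipv4 so the sysctl hierarchy exists regardless of the rest of
 * the ipv4 init order.
 */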
3344 static struct ctl_table empty[1];
3345
3346 static struct ctl_table ipv4_skeleton[] =
3347 {
3348         { .procname = "route",
3349           .mode = 0555, .child = ipv4_route_table},
3350         { .procname = "neigh",
3351           .mode = 0555, .child = empty},
3352         { }
3353 };
3354
3355 static __net_initdata struct ctl_path ipv4_path[] = {
3356         { .procname = "net", },
3357         { .procname = "ipv4", },
3358         { },
3359 };
3360
3361 static struct ctl_table ipv4_route_flush_table[] = {
3362         {
3363                 .procname       = "flush",
3364                 .maxlen         = sizeof(int),
3365                 .mode           = 0200,
3366                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3367         },
3368         { },
3369 };
3370
3371 static __net_initdata struct ctl_path ipv4_route_path[] = {
3372         { .procname = "net", },
3373         { .procname = "ipv4", },
3374         { .procname = "route", },
3375         { },
3376 };
3377
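/*
 * Per-namespace registration of the "flush" sysctl.  Namespaces other than
 * init_net get their own copy of ipv4_route_flush_table so that extra1 can
 * point at the owning struct net; the copy is freed again on exit.
 */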
3378 static __net_init int sysctl_route_net_init(struct net *net)
3379 {
3380         struct ctl_table *tbl;
3381
3382         tbl = ipv4_route_flush_table;
3383         if (!net_eq(net, &init_net)) {
3384                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3385                 if (tbl == NULL)
3386                         goto err_dup;
3387         }
3388         tbl[0].extra1 = net;
3389
3390         net->ipv4.route_hdr =
3391                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3392         if (net->ipv4.route_hdr == NULL)
3393                 goto err_reg;
3394         return 0;
3395
3396 err_reg:
3397         if (tbl != ipv4_route_flush_table)
3398                 kfree(tbl);
3399 err_dup:
3400         return -ENOMEM;
3401 }
3402
3403 static __net_exit void sysctl_route_net_exit(struct net *net)
3404 {
3405         struct ctl_table *tbl;
3406
3407         tbl = net->ipv4.route_hdr->ctl_table_arg;
3408         unregister_net_sysctl_table(net->ipv4.route_hdr);
3409         BUG_ON(tbl == ipv4_route_flush_table);
3410         kfree(tbl);
3411 }
3412
3413 static __net_initdata struct pernet_operations sysctl_route_ops = {
3414         .init = sysctl_route_net_init,
3415         .exit = sysctl_route_net_exit,
3416 };
3417 #endif
3418
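/*
 * Per-namespace init: seed rt_genid and dev_addr_genid with random values.
 * rt_genid is the generation number checked by rt_is_expired(), so bumping
 * it elsewhere invalidates every cached route of the namespace at once.
 */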
3419 static __net_init int rt_genid_init(struct net *net)
3420 {
3421         get_random_bytes(&net->ipv4.rt_genid,
3422                          sizeof(net->ipv4.rt_genid));
3423         get_random_bytes(&net->ipv4.dev_addr_genid,
3424                          sizeof(net->ipv4.dev_addr_genid));
3425         return 0;
3426 }
3427
3428 static __net_initdata struct pernet_operations rt_genid_ops = {
3429         .init = rt_genid_init,
3430 };
3431
3432
3433 #ifdef CONFIG_IP_ROUTE_CLASSID
3434 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3435 #endif /* CONFIG_IP_ROUTE_CLASSID */
3436
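/*
 * "rhash_entries=" boot parameter: overrides the automatically chosen size
 * of the route cache hash table allocated in ip_rt_init() below.
 *
 * Hypothetical kernel command line example:
 *
 *	rhash_entries=262144
 */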
3437 static __initdata unsigned long rhash_entries;
3438 static int __init set_rhash_entries(char *str)
3439 {
3440         if (!str)
3441                 return 0;
3442         rhash_entries = simple_strtoul(str, &str, 0);
3443         return 1;
3444 }
3445 __setup("rhash_entries=", set_rhash_entries);
3446
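/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing layer.
 *
 * Allocates the per-cpu classid accounting (if configured) and the dst slab
 * cache, sizes the route cache hash table (honouring rhash_entries=),
 * derives gc_thresh and ip_rt_max_size from it, initialises devinet and the
 * FIB, starts the periodic expiry worker, creates the /proc files, hooks in
 * xfrm, and registers the RTM_GETROUTE handler plus the per-net sysctl and
 * genid operations.
 */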
3447 int __init ip_rt_init(void)
3448 {
3449         int rc = 0;
3450
3451 #ifdef CONFIG_IP_ROUTE_CLASSID
3452         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3453         if (!ip_rt_acct)
3454                 panic("IP: failed to allocate ip_rt_acct\n");
3455 #endif
3456
3457         ipv4_dst_ops.kmem_cachep =
3458                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3459                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3460
3461         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3462
3463         if (dst_entries_init(&ipv4_dst_ops) < 0)
3464                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3465
3466         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3467                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3468
3469         rt_hash_table = (struct rt_hash_bucket *)
3470                 alloc_large_system_hash("IP route cache",
3471                                         sizeof(struct rt_hash_bucket),
3472                                         rhash_entries,
3473                                         (totalram_pages >= 128 * 1024) ?
3474                                         15 : 17,
3475                                         0,
3476                                         &rt_hash_log,
3477                                         &rt_hash_mask,
3478                                         rhash_entries ? 0 : 512 * 1024);
3479         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3480         rt_hash_lock_init();
3481
3482         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3483         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3484
3485         devinet_init();
3486         ip_fib_init();
3487
3488         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3489         expires_ljiffies = jiffies;
3490         schedule_delayed_work(&expires_work,
3491                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3492
3493         if (ip_rt_proc_init())
3494                 printk(KERN_ERR "Unable to create route proc files\n");
3495 #ifdef CONFIG_XFRM
3496         xfrm_init();
3497         xfrm4_init(ip_rt_max_size);
3498 #endif
3499         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3500
3501 #ifdef CONFIG_SYSCTL
3502         register_pernet_subsys(&sysctl_route_ops);
3503 #endif
3504         register_pernet_subsys(&rt_genid_ops);
3505         return rc;
3506 }
3507
3508 #ifdef CONFIG_SYSCTL
3509 /*
3510  * We really need to sanitize the damn ipv4 init order; then all
3511  * this nonsense will go away.
3512  */
3513 void __init ip_static_sysctl_init(void)
3514 {
3515         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3516 }
3517 #endif