/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

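/* Boot-time defaults for the route tunables below; in route.c these are
 * also exposed as sysctls under /proc/sys/net/ipv4/route/ (the ctl_table
 * registration lives later in this file, beyond this excerpt).
 */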
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

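/* The dst_ops vtable every IPv4 route (struct rtable) is created with;
 * the generic dst layer calls back through these pointers for route
 * validation, MTU queries, PMTU updates and ICMP redirects.
 */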
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

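/* Map the IPv4 TOS field to a packet scheduler priority band; the table
 * is indexed with (tos & IPTOS_TOS_MASK) >> 1 (see rt_tos2priority() in
 * <net/route.h>), with alternating entries carrying the ECN_OR_COST()
 * variant of each class.
 */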
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
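/* The IPv4 routing cache itself is long gone, but /proc/net/rt_cache is
 * kept for compatibility: the seq_file below emits only the header line
 * and never any entries.
 */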
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

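/* /proc/net/stat/rt_cache iterator: *pos == 0 yields the header token,
 * and *pos == n + 1 addresses the counters of cpu n, skipping CPUs that
 * are not possible.
 */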
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

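/* Pick an IP ID for a datagram: hash (daddr, saddr, protocol) with a
 * boot-time random key to select one of the IP_IDENTS_SZ generators,
 * then reserve 'segs' consecutive IDs so a GSO superpacket gets one ID
 * per resulting segment.
 */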
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

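/* fnhe_lock serializes all writers of the per-nexthop exception tables
 * (nh->nh_exceptions); lookups run locklessly under RCU.
 */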
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

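/* Record state learned from ICMP (a redirect gateway and/or a PMTU value
 * with its expiry) as an exception hanging off the nexthop.  Chains
 * deeper than FNHE_RECLAIM_DEPTH recycle their oldest entry instead of
 * growing further.
 */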
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

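/* Validate an ICMP redirect before accepting the advertised gateway: the
 * redirect must come from the current gateway, the new gateway must be a
 * sane unicast address, and on non-shared media it must be on-link
 * (cf. the redirect handling rules of RFC 1122).
 */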
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

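/* Assuming the sysctls are left at the defaults above (redirect_number
 * = 9, redirect_load = HZ/50, redirect_silence = (HZ/50) << 10), the
 * send intervals double from 40 ms up to roughly 5 s across the nine
 * permitted redirects, and the token counter resets after about 20 s
 * without redirect-worthy traffic.
 */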
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

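/* ip_error() rate-limits the ICMP errors it emits with a per-peer token
 * bucket: tokens accrue one per jiffy up to ip_rt_error_burst (5 * HZ),
 * and each error costs ip_rt_error_cost (HZ) - i.e. bursts of five,
 * roughly one per second sustained.
 */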
static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

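/* Apply a PMTU learned from ICMP to a route: the new value is clamped
 * below by ip_rt_min_pmtu, ignored if the MTU metric is locked or not
 * actually smaller, and stored as a nexthop exception that expires after
 * ip_rt_mtu_expires (10 minutes by default).
 */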
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW, remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

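/* Effective MTU of a route, in decreasing order of preference: an
 * unexpired PMTU learned via ICMP (rt_pmtu), the RTAX_MTU metric, then
 * the device MTU - clamped to 576 for locked-MTU routes via a gateway,
 * and reduced by any lwtunnel encapsulation headroom.
 */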
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

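/* Try to park @rt in the nexthop cache (nh_rth_input for input routes,
 * the per-cpu nh_pcpu_rth_output slot otherwise).  The slot is claimed
 * with cmpxchg(); on success the previous occupant is released, and on
 * a lost race our own extra hold is dropped and false is returned.
 */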
1348 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1349 {
1350         struct rtable *orig, *prev, **p;
1351         bool ret = true;
1352
1353         if (rt_is_input_route(rt)) {
1354                 p = (struct rtable **)&nh->nh_rth_input;
1355         } else {
1356                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1357         }
1358         orig = *p;
1359
1360         /* hold dst before doing cmpxchg() to avoid race condition
1361          * on this dst
1362          */
1363         dst_hold(&rt->dst);
1364         prev = cmpxchg(p, orig, rt);
1365         if (prev == orig) {
1366                 if (orig) {
1367                         dst_dev_put(&orig->dst);
1368                         dst_release(&orig->dst);
1369                 }
1370         } else {
1371                 dst_release(&rt->dst);
1372                 ret = false;
1373         }
1374
1375         return ret;
1376 }
1377
1378 struct uncached_list {
1379         spinlock_t              lock;
1380         struct list_head        head;
1381 };
1382
1383 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1384
1385 static void rt_add_uncached_list(struct rtable *rt)
1386 {
1387         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1388
1389         rt->rt_uncached_list = ul;
1390
1391         spin_lock_bh(&ul->lock);
1392         list_add_tail(&rt->rt_uncached, &ul->head);
1393         spin_unlock_bh(&ul->lock);
1394 }
1395
1396 static void ipv4_dst_destroy(struct dst_entry *dst)
1397 {
1398         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1399         struct rtable *rt = (struct rtable *) dst;
1400
1401         if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1402                 kfree(p);
1403
1404         if (!list_empty(&rt->rt_uncached)) {
1405                 struct uncached_list *ul = rt->rt_uncached_list;
1406
1407                 spin_lock_bh(&ul->lock);
1408                 list_del(&rt->rt_uncached);
1409                 spin_unlock_bh(&ul->lock);
1410         }
1411 }
1412
1413 void rt_flush_dev(struct net_device *dev)
1414 {
1415         struct net *net = dev_net(dev);
1416         struct rtable *rt;
1417         int cpu;
1418
1419         for_each_possible_cpu(cpu) {
1420                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1421
1422                 spin_lock_bh(&ul->lock);
1423                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1424                         if (rt->dst.dev != dev)
1425                                 continue;
1426                         rt->dst.dev = net->loopback_dev;
1427                         dev_hold(rt->dst.dev);
1428                         dev_put(dev);
1429                 }
1430                 spin_unlock_bh(&ul->lock);
1431         }
1432 }
1433
1434 static bool rt_cache_valid(const struct rtable *rt)
1435 {
1436         return  rt &&
1437                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1438                 !rt_is_expired(rt);
1439 }
1440
1441 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1442                            const struct fib_result *res,
1443                            struct fib_nh_exception *fnhe,
1444                            struct fib_info *fi, u16 type, u32 itag,
1445                            const bool do_cache)
1446 {
1447         bool cached = false;
1448
1449         if (fi) {
1450                 struct fib_nh *nh = &FIB_RES_NH(*res);
1451
1452                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1453                         rt->rt_gateway = nh->nh_gw;
1454                         rt->rt_uses_gateway = 1;
1455                 }
1456                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1457                 if (fi->fib_metrics != &dst_default_metrics) {
1458                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1459                         atomic_inc(&fi->fib_metrics->refcnt);
1460                 }
1461 #ifdef CONFIG_IP_ROUTE_CLASSID
1462                 rt->dst.tclassid = nh->nh_tclassid;
1463 #endif
1464                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1465                 if (unlikely(fnhe))
1466                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1467                 else if (do_cache)
1468                         cached = rt_cache_route(nh, rt);
1469                 if (unlikely(!cached)) {
1470                         /* Routes we intend to cache in nexthop exception or
1471                          * FIB nexthop have the DST_NOCACHE bit clear.
1472                          * However, if we are unsuccessful at storing this
1473                          * route into the cache we really need to set it.
1474                          */
1475                         if (!rt->rt_gateway)
1476                                 rt->rt_gateway = daddr;
1477                         rt_add_uncached_list(rt);
1478                 }
1479         } else
1480                 rt_add_uncached_list(rt);
1481
1482 #ifdef CONFIG_IP_ROUTE_CLASSID
1483 #ifdef CONFIG_IP_MULTIPLE_TABLES
1484         set_class_tag(rt, res->tclassid);
1485 #endif
1486         set_class_tag(rt, itag);
1487 #endif
1488 }
1489
1490 struct rtable *rt_dst_alloc(struct net_device *dev,
1491                             unsigned int flags, u16 type,
1492                             bool nopolicy, bool noxfrm, bool will_cache)
1493 {
1494         struct rtable *rt;
1495
1496         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1497                        (will_cache ? 0 : DST_HOST) |
1498                        (nopolicy ? DST_NOPOLICY : 0) |
1499                        (noxfrm ? DST_NOXFRM : 0));
1500
1501         if (rt) {
1502                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1503                 rt->rt_flags = flags;
1504                 rt->rt_type = type;
1505                 rt->rt_is_input = 0;
1506                 rt->rt_iif = 0;
1507                 rt->rt_pmtu = 0;
1508                 rt->rt_gateway = 0;
1509                 rt->rt_uses_gateway = 0;
1510                 rt->rt_table_id = 0;
1511                 INIT_LIST_HEAD(&rt->rt_uncached);
1512
1513                 rt->dst.output = ip_output;
1514                 if (flags & RTCF_LOCAL)
1515                         rt->dst.input = ip_local_deliver;
1516         }
1517
1518         return rt;
1519 }
1520 EXPORT_SYMBOL(rt_dst_alloc);
1521
1522 /* called in rcu_read_lock() section */
1523 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1524                                 u8 tos, struct net_device *dev, int our)
1525 {
1526         struct rtable *rth;
1527         struct in_device *in_dev = __in_dev_get_rcu(dev);
1528         unsigned int flags = RTCF_MULTICAST;
1529         u32 itag = 0;
1530         int err;
1531
1532         /* Primary sanity checks. */
1533
1534         if (!in_dev)
1535                 return -EINVAL;
1536
1537         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1538             skb->protocol != htons(ETH_P_IP))
1539                 goto e_inval;
1540
1541         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1542                 goto e_inval;
1543
1544         if (ipv4_is_zeronet(saddr)) {
1545                 if (!ipv4_is_local_multicast(daddr))
1546                         goto e_inval;
1547         } else {
1548                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1549                                           in_dev, &itag);
1550                 if (err < 0)
1551                         goto e_err;
1552         }
1553         if (our)
1554                 flags |= RTCF_LOCAL;
1555
1556         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1557                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1558         if (!rth)
1559                 goto e_nobufs;
1560
1561 #ifdef CONFIG_IP_ROUTE_CLASSID
1562         rth->dst.tclassid = itag;
1563 #endif
1564         rth->dst.output = ip_rt_bug;
1565         rth->rt_is_input = 1;
1566
1567 #ifdef CONFIG_IP_MROUTE
1568         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1569                 rth->dst.input = ip_mr_input;
1570 #endif
1571         RT_CACHE_STAT_INC(in_slow_mc);
1572
1573         skb_dst_set(skb, &rth->dst);
1574         return 0;
1575
1576 e_nobufs:
1577         return -ENOBUFS;
1578 e_inval:
1579         return -EINVAL;
1580 e_err:
1581         return err;
1582 }
1583
1584
1585 static void ip_handle_martian_source(struct net_device *dev,
1586                                      struct in_device *in_dev,
1587                                      struct sk_buff *skb,
1588                                      __be32 daddr,
1589                                      __be32 saddr)
1590 {
1591         RT_CACHE_STAT_INC(in_martian_src);
1592 #ifdef CONFIG_IP_ROUTE_VERBOSE
1593         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1594                 /*
1595                  *      RFC1812 recommendation, if source is martian,
1596                  *      the only hint is MAC header.
1597                  */
1598                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1599                         &daddr, &saddr, dev->name);
1600                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1601                         print_hex_dump(KERN_WARNING, "ll header: ",
1602                                        DUMP_PREFIX_OFFSET, 16, 1,
1603                                        skb_mac_header(skb),
1604                                        dev->hard_header_len, true);
1605                 }
1606         }
1607 #endif
1608 }
1609
1610 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1611 {
1612         struct fnhe_hash_bucket *hash;
1613         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1614         u32 hval = fnhe_hashfun(daddr);
1615
1616         spin_lock_bh(&fnhe_lock);
1617
1618         hash = rcu_dereference_protected(nh->nh_exceptions,
1619                                          lockdep_is_held(&fnhe_lock));
1620         hash += hval;
1621
1622         fnhe_p = &hash->chain;
1623         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1624         while (fnhe) {
1625                 if (fnhe->fnhe_daddr == daddr) {
1626                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1627                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1628                         fnhe_flush_routes(fnhe);
1629                         kfree_rcu(fnhe, rcu);
1630                         break;
1631                 }
1632                 fnhe_p = &fnhe->fnhe_next;
1633                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1634                                                  lockdep_is_held(&fnhe_lock));
1635         }
1636
1637         spin_unlock_bh(&fnhe_lock);
1638 }
1639
1640 static void set_lwt_redirect(struct rtable *rth)
1641 {
1642         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1643                 rth->dst.lwtstate->orig_output = rth->dst.output;
1644                 rth->dst.output = lwtunnel_output;
1645         }
1646
1647         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1648                 rth->dst.lwtstate->orig_input = rth->dst.input;
1649                 rth->dst.input = lwtunnel_input;
1650         }
1651 }
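/*
 * Editor's illustrative sketch (not part of the original file): what the
 * redirection above buys us.  Once rth->dst.output points at
 * lwtunnel_output(), the lwtunnel layer performs its encapsulation and
 * then chains to the handler saved in orig_output -- roughly as in this
 * simplified, hypothetical rendition (the real code lives in lwtunnel.c):
 */
#if 0	/* example only -- not compiled */
static int example_lwtunnel_output(struct net *net, struct sock *sk,
				   struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	/* ...encapsulate according to dst->lwtstate here... */
	return dst->lwtstate->orig_output(net, sk, skb);
}
#endif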
1652
1653 /* called in rcu_read_lock() section */
1654 static int __mkroute_input(struct sk_buff *skb,
1655                            const struct fib_result *res,
1656                            struct in_device *in_dev,
1657                            __be32 daddr, __be32 saddr, u32 tos)
1658 {
1659         struct fib_nh_exception *fnhe;
1660         struct rtable *rth;
1661         int err;
1662         struct in_device *out_dev;
1663         bool do_cache;
1664         u32 itag = 0;
1665
1666         /* get a working reference to the output device */
1667         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1668         if (!out_dev) {
1669                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1670                 return -EINVAL;
1671         }
1672
1673         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1674                                   in_dev->dev, in_dev, &itag);
1675         if (err < 0) {
1676                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1677                                          saddr);
1678
1679                 goto cleanup;
1680         }
1681
1682         do_cache = res->fi && !itag;
1683         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1684             skb->protocol == htons(ETH_P_IP) &&
1685             (IN_DEV_SHARED_MEDIA(out_dev) ||
1686              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1687                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1688
1689         if (skb->protocol != htons(ETH_P_IP)) {
1690                 /* Not IP (i.e. ARP). Do not create a route if it is
1691                  * invalid for proxy ARP. DNAT routes are always valid.
1692                  *
1693                  * The proxy ARP feature has been extended to allow ARP
1694                  * replies back to the same interface, to support
1695                  * Private VLAN switch technologies. See arp.c.
1696                  */
1697                 if (out_dev == in_dev &&
1698                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1699                         err = -EINVAL;
1700                         goto cleanup;
1701                 }
1702         }
1703
1704         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1705         if (do_cache) {
1706                 if (fnhe) {
1707                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1708                         if (rth && rth->dst.expires &&
1709                             time_after(jiffies, rth->dst.expires)) {
1710                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1711                                 fnhe = NULL;
1712                         } else {
1713                                 goto rt_cache;
1714                         }
1715                 }
1716
1717                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1718
1719 rt_cache:
1720                 if (rt_cache_valid(rth)) {
1721                         skb_dst_set_noref(skb, &rth->dst);
1722                         goto out;
1723                 }
1724         }
1725
1726         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1727                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1728                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1729         if (!rth) {
1730                 err = -ENOBUFS;
1731                 goto cleanup;
1732         }
1733
1734         rth->rt_is_input = 1;
1735         if (res->table)
1736                 rth->rt_table_id = res->table->tb_id;
1737         RT_CACHE_STAT_INC(in_slow_tot);
1738
1739         rth->dst.input = ip_forward;
1740
1741         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1742                        do_cache);
1743         set_lwt_redirect(rth);
1744         skb_dst_set(skb, &rth->dst);
1745 out:
1746         err = 0;
1747  cleanup:
1748         return err;
1749 }
1750
1751 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1752 /* To make ICMP packets follow the right flow, the multipath hash is
1753  * calculated from the inner IP addresses.
1754  */
1755 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1756                                  struct flow_keys *hash_keys)
1757 {
1758         const struct iphdr *outer_iph = ip_hdr(skb);
1759         const struct iphdr *inner_iph;
1760         const struct icmphdr *icmph;
1761         struct iphdr _inner_iph;
1762         struct icmphdr _icmph;
1763
1764         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1765         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1766         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1767                 return;
1768
1769         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1770                 return;
1771
1772         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1773                                    &_icmph);
1774         if (!icmph)
1775                 return;
1776
1777         if (icmph->type != ICMP_DEST_UNREACH &&
1778             icmph->type != ICMP_REDIRECT &&
1779             icmph->type != ICMP_TIME_EXCEEDED &&
1780             icmph->type != ICMP_PARAMETERPROB)
1781                 return;
1782
1783         inner_iph = skb_header_pointer(skb,
1784                                        outer_iph->ihl * 4 + sizeof(_icmph),
1785                                        sizeof(_inner_iph), &_inner_iph);
1786         if (!inner_iph)
1787                 return;
1788         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1789         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1790 }
1791
1792 /* if skb is set it will be used and fl4 can be NULL */
1793 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1794                        const struct sk_buff *skb)
1795 {
1796         struct net *net = fi->fib_net;
1797         struct flow_keys hash_keys;
1798         u32 mhash;
1799
1800         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1801         case 0:
1802                 memset(&hash_keys, 0, sizeof(hash_keys));
1803                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1804                 if (skb) {
1805                         ip_multipath_l3_keys(skb, &hash_keys);
1806                 } else {
1807                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1808                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1809                 }
1810                 break;
1811         case 1:
1812                 /* skb is currently provided only when forwarding */
1813                 if (skb) {
1814                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1815                         struct flow_keys keys;
1816
1817                         /* short-circuit if we already have L4 hash present */
1818                         if (skb->l4_hash)
1819                                 return skb_get_hash_raw(skb) >> 1;
1820                         memset(&hash_keys, 0, sizeof(hash_keys));
1821                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1822                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1823                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1824                         hash_keys.ports.src = keys.ports.src;
1825                         hash_keys.ports.dst = keys.ports.dst;
1826                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1827                 } else {
1828                         memset(&hash_keys, 0, sizeof(hash_keys));
1829                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1830                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1831                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1832                         hash_keys.ports.src = fl4->fl4_sport;
1833                         hash_keys.ports.dst = fl4->fl4_dport;
1834                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1835                 }
1836                 break;
1837         }
1838         mhash = flow_hash_from_keys(&hash_keys);
1839
1840         return mhash >> 1;
1841 }
1842 EXPORT_SYMBOL_GPL(fib_multipath_hash);
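/*
 * Editor's illustrative sketch (not part of the original file): how the
 * 31-bit hash returned above is consumed.  fib_select_multipath() (in
 * fib_semantics.c) uses hash-threshold nexthop selection: each nexthop
 * carries a precomputed upper bound scaled by its weight, and the first
 * bound covering the hash wins.  The helper below is a simplified,
 * hypothetical rendition of that idea, not the kernel implementation.
 */
#if 0	/* example only -- not compiled */
struct example_nh { int upper_bound; };	/* hypothetical */

static int example_select_nexthop(const struct example_nh *nhs, int nr,
				  int hash)
{
	int i;

	for (i = 0; i < nr; i++)
		if (hash <= nhs[i].upper_bound)
			return i;
	return nr - 1;	/* defensive fallback */
}
#endif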
1843 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1844
1845 static int ip_mkroute_input(struct sk_buff *skb,
1846                             struct fib_result *res,
1847                             struct in_device *in_dev,
1848                             __be32 daddr, __be32 saddr, u32 tos)
1849 {
1850 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1851         if (res->fi && res->fi->fib_nhs > 1) {
1852                 int h = fib_multipath_hash(res->fi, NULL, skb);
1853
1854                 fib_select_multipath(res, h);
1855         }
1856 #endif
1857
1858         /* create a routing cache entry */
1859         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1860 }
1861
1862 /*
1863  *      NOTE. We drop all packets that have a local source
1864  *      address, because every properly looped-back packet
1865  *      must already have the correct destination attached by the output routine.
1866  *
1867  *      This approach solves two big problems:
1868  *      1. Non-simplex devices are handled properly.
1869  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1870  *      Called with rcu_read_lock().
1871  */
1872
1873 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1874                                u8 tos, struct net_device *dev,
1875                                struct fib_result *res)
1876 {
1877         struct in_device *in_dev = __in_dev_get_rcu(dev);
1878         struct ip_tunnel_info *tun_info;
1879         struct flowi4   fl4;
1880         unsigned int    flags = 0;
1881         u32             itag = 0;
1882         struct rtable   *rth;
1883         int             err = -EINVAL;
1884         struct net    *net = dev_net(dev);
1885         bool do_cache;
1886
1887         /* IP on this device is disabled. */
1888
1889         if (!in_dev)
1890                 goto out;
1891
1892         /* Check for the most weird martians, which cannot be detected
1893            by fib_lookup.
1894          */
1895
1896         tun_info = skb_tunnel_info(skb);
1897         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1898                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1899         else
1900                 fl4.flowi4_tun_key.tun_id = 0;
1901         skb_dst_drop(skb);
1902
1903         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1904                 goto martian_source;
1905
1906         res->fi = NULL;
1907         res->table = NULL;
1908         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1909                 goto brd_input;
1910
1911         /* Accept zero addresses only for limited broadcast;
1912          * I do not even know whether to fix this or not. Waiting for complaints :-)
1913          */
1914         if (ipv4_is_zeronet(saddr))
1915                 goto martian_source;
1916
1917         if (ipv4_is_zeronet(daddr))
1918                 goto martian_destination;
1919
1920         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1921          * more than once, calling it only when daddr and/or saddr is a loopback address.
1922          */
1923         if (ipv4_is_loopback(daddr)) {
1924                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1925                         goto martian_destination;
1926         } else if (ipv4_is_loopback(saddr)) {
1927                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1928                         goto martian_source;
1929         }
1930
1931         /*
1932          *      Now we are ready to route packet.
1933          */
1934         fl4.flowi4_oif = 0;
1935         fl4.flowi4_iif = dev->ifindex;
1936         fl4.flowi4_mark = skb->mark;
1937         fl4.flowi4_tos = tos;
1938         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1939         fl4.flowi4_flags = 0;
1940         fl4.daddr = daddr;
1941         fl4.saddr = saddr;
1942         fl4.flowi4_uid = sock_net_uid(net, NULL);
1943         err = fib_lookup(net, &fl4, res, 0);
1944         if (err != 0) {
1945                 if (!IN_DEV_FORWARD(in_dev))
1946                         err = -EHOSTUNREACH;
1947                 goto no_route;
1948         }
1949
1950         if (res->type == RTN_BROADCAST)
1951                 goto brd_input;
1952
1953         if (res->type == RTN_LOCAL) {
1954                 err = fib_validate_source(skb, saddr, daddr, tos,
1955                                           0, dev, in_dev, &itag);
1956                 if (err < 0)
1957                         goto martian_source;
1958                 goto local_input;
1959         }
1960
1961         if (!IN_DEV_FORWARD(in_dev)) {
1962                 err = -EHOSTUNREACH;
1963                 goto no_route;
1964         }
1965         if (res->type != RTN_UNICAST)
1966                 goto martian_destination;
1967
1968         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1969 out:    return err;
1970
1971 brd_input:
1972         if (skb->protocol != htons(ETH_P_IP))
1973                 goto e_inval;
1974
1975         if (!ipv4_is_zeronet(saddr)) {
1976                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1977                                           in_dev, &itag);
1978                 if (err < 0)
1979                         goto martian_source;
1980         }
1981         flags |= RTCF_BROADCAST;
1982         res->type = RTN_BROADCAST;
1983         RT_CACHE_STAT_INC(in_brd);
1984
1985 local_input:
1986         do_cache = false;
1987         if (res->fi) {
1988                 if (!itag) {
1989                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1990                         if (rt_cache_valid(rth)) {
1991                                 skb_dst_set_noref(skb, &rth->dst);
1992                                 err = 0;
1993                                 goto out;
1994                         }
1995                         do_cache = true;
1996                 }
1997         }
1998
1999         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2000                            flags | RTCF_LOCAL, res->type,
2001                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2002         if (!rth)
2003                 goto e_nobufs;
2004
2005         rth->dst.output = ip_rt_bug;
2006 #ifdef CONFIG_IP_ROUTE_CLASSID
2007         rth->dst.tclassid = itag;
2008 #endif
2009         rth->rt_is_input = 1;
2010         if (res->table)
2011                 rth->rt_table_id = res->table->tb_id;
2012
2013         RT_CACHE_STAT_INC(in_slow_tot);
2014         if (res->type == RTN_UNREACHABLE) {
2015                 rth->dst.input = ip_error;
2016                 rth->dst.error = -err;
2017                 rth->rt_flags &= ~RTCF_LOCAL;
2018         }
2019
2020         if (do_cache) {
2021                 struct fib_nh *nh = &FIB_RES_NH(*res);
2022
2023                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2024                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2025                         WARN_ON(rth->dst.input == lwtunnel_input);
2026                         rth->dst.lwtstate->orig_input = rth->dst.input;
2027                         rth->dst.input = lwtunnel_input;
2028                 }
2029
2030                 if (unlikely(!rt_cache_route(nh, rth)))
2031                         rt_add_uncached_list(rth);
2032         }
2033         skb_dst_set(skb, &rth->dst);
2034         err = 0;
2035         goto out;
2036
2037 no_route:
2038         RT_CACHE_STAT_INC(in_no_route);
2039         res->type = RTN_UNREACHABLE;
2040         res->fi = NULL;
2041         res->table = NULL;
2042         goto local_input;
2043
2044         /*
2045          *      Do not cache martian addresses: they should be logged (RFC1812)
2046          */
2047 martian_destination:
2048         RT_CACHE_STAT_INC(in_martian_dst);
2049 #ifdef CONFIG_IP_ROUTE_VERBOSE
2050         if (IN_DEV_LOG_MARTIANS(in_dev))
2051                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2052                                      &daddr, &saddr, dev->name);
2053 #endif
2054
2055 e_inval:
2056         err = -EINVAL;
2057         goto out;
2058
2059 e_nobufs:
2060         err = -ENOBUFS;
2061         goto out;
2062
2063 martian_source:
2064         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2065         goto out;
2066 }
2067
2068 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2069                          u8 tos, struct net_device *dev)
2070 {
2071         struct fib_result res;
2072         int err;
2073
2074         tos &= IPTOS_RT_MASK;
2075         rcu_read_lock();
2076         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2077         rcu_read_unlock();
2078
2079         return err;
2080 }
2081 EXPORT_SYMBOL(ip_route_input_noref);
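/*
 * Editor's illustrative sketch (not part of the original file): the
 * canonical caller pattern for ip_route_input_noref(), modelled on
 * ip_rcv_finish() in ip_input.c.  Simplified -- the real receive path
 * also handles early demux and IP options.
 */
#if 0	/* example only -- not compiled */
static int example_route_incoming(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				       iph->tos, skb->dev);

	if (err) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	/* dst_input() dispatches to ip_local_deliver or ip_forward */
	return dst_input(skb);
}
#endif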
2082
2083 /* called with rcu_read_lock held */
2084 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2085                        u8 tos, struct net_device *dev, struct fib_result *res)
2086 {
2087         /* Multicast recognition logic was moved from the route cache to here.
2088            The problem was that too many Ethernet cards have broken/missing
2089            hardware multicast filters :-( As a result, a host on a multicast
2090            network acquires a lot of useless route cache entries, e.g. for
2091            SDR messages from all over the world. Now we try to get rid of them.
2092            Really, provided the software IP multicast filter is organized
2093            reasonably (at least, hashed), this does not result in a slowdown
2094            compared with route cache reject entries.
2095            Note that multicast routers are not affected, because a
2096            route cache entry is created eventually.
2097          */
2098         if (ipv4_is_multicast(daddr)) {
2099                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2100                 int our = 0;
2101                 int err = -EINVAL;
2102
2103                 if (in_dev)
2104                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2105                                               ip_hdr(skb)->protocol);
2106
2107                 /* check l3 master if no match yet */
2108                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2109                         struct in_device *l3_in_dev;
2110
2111                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2112                         if (l3_in_dev)
2113                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2114                                                       ip_hdr(skb)->protocol);
2115                 }
2116
2117                 if (our
2118 #ifdef CONFIG_IP_MROUTE
2119                         ||
2120                     (!ipv4_is_local_multicast(daddr) &&
2121                      IN_DEV_MFORWARD(in_dev))
2122 #endif
2123                    ) {
2124                         err = ip_route_input_mc(skb, daddr, saddr,
2125                                                 tos, dev, our);
2126                 }
2127                 return err;
2128         }
2129
2130         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2131 }
2132
2133 /* called with rcu_read_lock() */
2134 static struct rtable *__mkroute_output(const struct fib_result *res,
2135                                        const struct flowi4 *fl4, int orig_oif,
2136                                        struct net_device *dev_out,
2137                                        unsigned int flags)
2138 {
2139         struct fib_info *fi = res->fi;
2140         struct fib_nh_exception *fnhe;
2141         struct in_device *in_dev;
2142         u16 type = res->type;
2143         struct rtable *rth;
2144         bool do_cache;
2145
2146         in_dev = __in_dev_get_rcu(dev_out);
2147         if (!in_dev)
2148                 return ERR_PTR(-EINVAL);
2149
2150         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2151                 if (ipv4_is_loopback(fl4->saddr) &&
2152                     !(dev_out->flags & IFF_LOOPBACK) &&
2153                     !netif_is_l3_master(dev_out))
2154                         return ERR_PTR(-EINVAL);
2155
2156         if (ipv4_is_lbcast(fl4->daddr))
2157                 type = RTN_BROADCAST;
2158         else if (ipv4_is_multicast(fl4->daddr))
2159                 type = RTN_MULTICAST;
2160         else if (ipv4_is_zeronet(fl4->daddr))
2161                 return ERR_PTR(-EINVAL);
2162
2163         if (dev_out->flags & IFF_LOOPBACK)
2164                 flags |= RTCF_LOCAL;
2165
2166         do_cache = true;
2167         if (type == RTN_BROADCAST) {
2168                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2169                 fi = NULL;
2170         } else if (type == RTN_MULTICAST) {
2171                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2172                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2173                                      fl4->flowi4_proto))
2174                         flags &= ~RTCF_LOCAL;
2175                 else
2176                         do_cache = false;
2177                 /* If a multicast route does not exist, use the
2178                  * default one, but do not use a gateway in this case.
2179                  * Yes, it is a hack.
2180                  */
2181                 if (fi && res->prefixlen < 4)
2182                         fi = NULL;
2183         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2184                    (orig_oif != dev_out->ifindex)) {
2185                 /* For local routes that require a particular output interface
2186                  * we do not want to cache the result.  Caching the result
2187                  * causes incorrect behaviour when there are multiple source
2188                  * addresses on the interface, the end result being that if the
2189                  * intended recipient is waiting on that interface for the
2190                  * packet he won't receive it because it will be delivered on
2191                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2192                  * be set to the loopback interface as well.
2193                  */
2194                 fi = NULL;
2195         }
2196
2197         fnhe = NULL;
2198         do_cache &= fi != NULL;
2199         if (do_cache) {
2200                 struct rtable __rcu **prth;
2201                 struct fib_nh *nh = &FIB_RES_NH(*res);
2202
2203                 fnhe = find_exception(nh, fl4->daddr);
2204                 if (fnhe) {
2205                         prth = &fnhe->fnhe_rth_output;
2206                         rth = rcu_dereference(*prth);
2207                         if (rth && rth->dst.expires &&
2208                             time_after(jiffies, rth->dst.expires)) {
2209                                 ip_del_fnhe(nh, fl4->daddr);
2210                                 fnhe = NULL;
2211                         } else {
2212                                 goto rt_cache;
2213                         }
2214                 }
2215
2216                 if (unlikely(fl4->flowi4_flags &
2217                              FLOWI_FLAG_KNOWN_NH &&
2218                              !(nh->nh_gw &&
2219                                nh->nh_scope == RT_SCOPE_LINK))) {
2220                         do_cache = false;
2221                         goto add;
2222                 }
2223                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2224                 rth = rcu_dereference(*prth);
2225
2226 rt_cache:
2227                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2228                         return rth;
2229         }
2230
2231 add:
2232         rth = rt_dst_alloc(dev_out, flags, type,
2233                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2234                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2235                            do_cache);
2236         if (!rth)
2237                 return ERR_PTR(-ENOBUFS);
2238
2239         rth->rt_iif     = orig_oif ? : 0;
2240         if (res->table)
2241                 rth->rt_table_id = res->table->tb_id;
2242
2243         RT_CACHE_STAT_INC(out_slow_tot);
2244
2245         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2246                 if (flags & RTCF_LOCAL &&
2247                     !(dev_out->flags & IFF_LOOPBACK)) {
2248                         rth->dst.output = ip_mc_output;
2249                         RT_CACHE_STAT_INC(out_slow_mc);
2250                 }
2251 #ifdef CONFIG_IP_MROUTE
2252                 if (type == RTN_MULTICAST) {
2253                         if (IN_DEV_MFORWARD(in_dev) &&
2254                             !ipv4_is_local_multicast(fl4->daddr)) {
2255                                 rth->dst.input = ip_mr_input;
2256                                 rth->dst.output = ip_mc_output;
2257                         }
2258                 }
2259 #endif
2260         }
2261
2262         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2263         set_lwt_redirect(rth);
2264
2265         return rth;
2266 }
2267
2268 /*
2269  * Major route resolver routine.
2270  */
2271
2272 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2273                                         const struct sk_buff *skb)
2274 {
2275         __u8 tos = RT_FL_TOS(fl4);
2276         struct fib_result res;
2277         struct rtable *rth;
2278
2279         res.tclassid    = 0;
2280         res.fi          = NULL;
2281         res.table       = NULL;
2282
2283         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2284         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2285         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2286                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2287
2288         rcu_read_lock();
2289         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2290         rcu_read_unlock();
2291
2292         return rth;
2293 }
2294 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
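/*
 * Editor's illustrative sketch (not part of the original file): a minimal
 * output lookup as a hypothetical in-kernel caller might perform it.
 * ip_route_output_key() (include/net/route.h) is a thin wrapper around
 * the hash variant above; everything except the IS_ERR() check is elided.
 */
#if 0	/* example only -- not compiled */
static struct rtable *example_output_lookup(struct net *net,
					    __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;	/* 0 lets the kernel pick a source */
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return NULL;
	return rt;		/* caller must release with ip_rt_put() */
}
#endif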
2295
2296 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2297                                             struct fib_result *res,
2298                                             const struct sk_buff *skb)
2299 {
2300         struct net_device *dev_out = NULL;
2301         int orig_oif = fl4->flowi4_oif;
2302         unsigned int flags = 0;
2303         struct rtable *rth;
2304         int err = -ENETUNREACH;
2305
2306         if (fl4->saddr) {
2307                 rth = ERR_PTR(-EINVAL);
2308                 if (ipv4_is_multicast(fl4->saddr) ||
2309                     ipv4_is_lbcast(fl4->saddr) ||
2310                     ipv4_is_zeronet(fl4->saddr))
2311                         goto out;
2312
2313                 /* I removed the check for oif == dev_out->oif here.
2314                    It was wrong for two reasons:
2315                    1. ip_dev_find(net, saddr) can return the wrong iface if
2316                       saddr is assigned to multiple interfaces.
2317                    2. Moreover, we are allowed to send packets with the
2318                       saddr of another iface. --ANK
2319                  */
2320
2321                 if (fl4->flowi4_oif == 0 &&
2322                     (ipv4_is_multicast(fl4->daddr) ||
2323                      ipv4_is_lbcast(fl4->daddr))) {
2324                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2325                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2326                         if (!dev_out)
2327                                 goto out;
2328
2329                         /* Special hack: the user can direct multicasts
2330                            and limited broadcast via the necessary interface
2331                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2332                            This hack is not just for fun, it allows
2333                            vic, vat and friends to work.
2334                            They bind a socket to loopback, set ttl to zero
2335                            and expect that it will work.
2336                            From the viewpoint of the routing cache they are broken,
2337                            because we are not allowed to build a multicast path
2338                            with a loopback source addr (look, the routing cache
2339                            cannot know that ttl is zero, so the packet
2340                            will not leave this host and the route is valid).
2341                            Luckily, this hack is a good workaround.
2342                          */
2343
2344                         fl4->flowi4_oif = dev_out->ifindex;
2345                         goto make_route;
2346                 }
2347
2348                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2349                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2350                         if (!__ip_dev_find(net, fl4->saddr, false))
2351                                 goto out;
2352                 }
2353         }
2354
2355
2356         if (fl4->flowi4_oif) {
2357                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2358                 rth = ERR_PTR(-ENODEV);
2359                 if (!dev_out)
2360                         goto out;
2361
2362                 /* RACE: Check return value of inet_select_addr instead. */
2363                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2364                         rth = ERR_PTR(-ENETUNREACH);
2365                         goto out;
2366                 }
2367                 if (ipv4_is_local_multicast(fl4->daddr) ||
2368                     ipv4_is_lbcast(fl4->daddr) ||
2369                     fl4->flowi4_proto == IPPROTO_IGMP) {
2370                         if (!fl4->saddr)
2371                                 fl4->saddr = inet_select_addr(dev_out, 0,
2372                                                               RT_SCOPE_LINK);
2373                         goto make_route;
2374                 }
2375                 if (!fl4->saddr) {
2376                         if (ipv4_is_multicast(fl4->daddr))
2377                                 fl4->saddr = inet_select_addr(dev_out, 0,
2378                                                               fl4->flowi4_scope);
2379                         else if (!fl4->daddr)
2380                                 fl4->saddr = inet_select_addr(dev_out, 0,
2381                                                               RT_SCOPE_HOST);
2382                 }
2383         }
2384
2385         if (!fl4->daddr) {
2386                 fl4->daddr = fl4->saddr;
2387                 if (!fl4->daddr)
2388                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2389                 dev_out = net->loopback_dev;
2390                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2391                 res->type = RTN_LOCAL;
2392                 flags |= RTCF_LOCAL;
2393                 goto make_route;
2394         }
2395
2396         err = fib_lookup(net, fl4, res, 0);
2397         if (err) {
2398                 res->fi = NULL;
2399                 res->table = NULL;
2400                 if (fl4->flowi4_oif &&
2401                     (ipv4_is_multicast(fl4->daddr) ||
2402                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2403                         /* Apparently, the routing tables are wrong. Assume
2404                            that the destination is on-link.
2405
2406                            WHY? DW.
2407                            Because we are allowed to send to an iface
2408                            even if it has NO routes and NO assigned
2409                            addresses. When an oif is specified, the routing
2410                            tables are looked up with only one purpose:
2411                            to catch whether the destination is gatewayed
2412                            rather than direct. Moreover, if MSG_DONTROUTE is
2413                            set, we send the packet, ignoring both the routing
2414                            tables and the ifaddr state. --ANK
2415
2416
2417                            We could do this even if the oif is unknown
2418                            (IPv6 likely does), but we do not.
2419                          */
2420
2421                         if (fl4->saddr == 0)
2422                                 fl4->saddr = inet_select_addr(dev_out, 0,
2423                                                               RT_SCOPE_LINK);
2424                         res->type = RTN_UNICAST;
2425                         goto make_route;
2426                 }
2427                 rth = ERR_PTR(err);
2428                 goto out;
2429         }
2430
2431         if (res->type == RTN_LOCAL) {
2432                 if (!fl4->saddr) {
2433                         if (res->fi->fib_prefsrc)
2434                                 fl4->saddr = res->fi->fib_prefsrc;
2435                         else
2436                                 fl4->saddr = fl4->daddr;
2437                 }
2438
2439                 /* L3 master device is the loopback for that domain */
2440                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2441                         net->loopback_dev;
2442                 fl4->flowi4_oif = dev_out->ifindex;
2443                 flags |= RTCF_LOCAL;
2444                 goto make_route;
2445         }
2446
2447         fib_select_path(net, res, fl4, skb);
2448
2449         dev_out = FIB_RES_DEV(*res);
2450         fl4->flowi4_oif = dev_out->ifindex;
2451
2452
2453 make_route:
2454         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2455
2456 out:
2457         return rth;
2458 }
2459
2460 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2461 {
2462         return NULL;
2463 }
2464
2465 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2466 {
2467         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2468
2469         return mtu ? : dst->dev->mtu;
2470 }
2471
2472 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2473                                           struct sk_buff *skb, u32 mtu)
2474 {
2475 }
2476
2477 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2478                                        struct sk_buff *skb)
2479 {
2480 }
2481
2482 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2483                                           unsigned long old)
2484 {
2485         return NULL;
2486 }
2487
2488 static struct dst_ops ipv4_dst_blackhole_ops = {
2489         .family                 =       AF_INET,
2490         .check                  =       ipv4_blackhole_dst_check,
2491         .mtu                    =       ipv4_blackhole_mtu,
2492         .default_advmss         =       ipv4_default_advmss,
2493         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2494         .redirect               =       ipv4_rt_blackhole_redirect,
2495         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2496         .neigh_lookup           =       ipv4_neigh_lookup,
2497 };
2498
2499 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2500 {
2501         struct rtable *ort = (struct rtable *) dst_orig;
2502         struct rtable *rt;
2503
2504         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2505         if (rt) {
2506                 struct dst_entry *new = &rt->dst;
2507
2508                 new->__use = 1;
2509                 new->input = dst_discard;
2510                 new->output = dst_discard_out;
2511
2512                 new->dev = net->loopback_dev;
2513                 if (new->dev)
2514                         dev_hold(new->dev);
2515
2516                 rt->rt_is_input = ort->rt_is_input;
2517                 rt->rt_iif = ort->rt_iif;
2518                 rt->rt_pmtu = ort->rt_pmtu;
2519
2520                 rt->rt_genid = rt_genid_ipv4(net);
2521                 rt->rt_flags = ort->rt_flags;
2522                 rt->rt_type = ort->rt_type;
2523                 rt->rt_gateway = ort->rt_gateway;
2524                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2525
2526                 INIT_LIST_HEAD(&rt->rt_uncached);
2527         }
2528
2529         dst_release(dst_orig);
2530
2531         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2532 }
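/*
 * Editor's illustrative sketch (not part of the original file): the usual
 * consumer of ipv4_blackhole_route().  The xfrm layer swaps a real route
 * for a blackhole copy so packets are silently discarded while an IPsec
 * SA is still being negotiated; a hypothetical, simplified use:
 */
#if 0	/* example only -- not compiled */
static struct dst_entry *example_hold_packets(struct net *net,
					      struct dst_entry *dst)
{
	/* Consumes the reference on @dst; the returned dst discards
	 * everything via dst_discard()/dst_discard_out().
	 */
	return ipv4_blackhole_route(net, dst);
}
#endif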
2533
2534 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2535                                     const struct sock *sk)
2536 {
2537         struct rtable *rt = __ip_route_output_key(net, flp4);
2538
2539         if (IS_ERR(rt))
2540                 return rt;
2541
2542         if (flp4->flowi4_proto)
2543                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2544                                                         flowi4_to_flowi(flp4),
2545                                                         sk, 0);
2546
2547         return rt;
2548 }
2549 EXPORT_SYMBOL_GPL(ip_route_output_flow);
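/*
 * Editor's illustrative sketch (not part of the original file): socket
 * send paths use ip_route_output_flow() when the route may be subject to
 * IPsec policy.  A non-zero flowi4_proto triggers the xfrm_lookup_route()
 * step above; the hypothetical helper below shows the shape of such a call.
 */
#if 0	/* example only -- not compiled */
static struct rtable *example_ipsec_aware_lookup(struct net *net,
						 struct sock *sk,
						 __be32 daddr)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_proto = IPPROTO_TCP;	/* enables the xfrm step */

	return ip_route_output_flow(net, &fl4, sk);	/* may be ERR_PTR() */
}
#endif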
2550
2551 /* called with rcu_read_lock held */
2552 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2553                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2554                         u32 seq)
2555 {
2556         struct rtable *rt = skb_rtable(skb);
2557         struct rtmsg *r;
2558         struct nlmsghdr *nlh;
2559         unsigned long expires = 0;
2560         u32 error;
2561         u32 metrics[RTAX_MAX];
2562
2563         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2564         if (!nlh)
2565                 return -EMSGSIZE;
2566
2567         r = nlmsg_data(nlh);
2568         r->rtm_family    = AF_INET;
2569         r->rtm_dst_len  = 32;
2570         r->rtm_src_len  = 0;
2571         r->rtm_tos      = fl4->flowi4_tos;
2572         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2573         if (nla_put_u32(skb, RTA_TABLE, table_id))
2574                 goto nla_put_failure;
2575         r->rtm_type     = rt->rt_type;
2576         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2577         r->rtm_protocol = RTPROT_UNSPEC;
2578         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2579         if (rt->rt_flags & RTCF_NOTIFY)
2580                 r->rtm_flags |= RTM_F_NOTIFY;
2581         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2582                 r->rtm_flags |= RTCF_DOREDIRECT;
2583
2584         if (nla_put_in_addr(skb, RTA_DST, dst))
2585                 goto nla_put_failure;
2586         if (src) {
2587                 r->rtm_src_len = 32;
2588                 if (nla_put_in_addr(skb, RTA_SRC, src))
2589                         goto nla_put_failure;
2590         }
2591         if (rt->dst.dev &&
2592             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2593                 goto nla_put_failure;
2594 #ifdef CONFIG_IP_ROUTE_CLASSID
2595         if (rt->dst.tclassid &&
2596             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2597                 goto nla_put_failure;
2598 #endif
2599         if (!rt_is_input_route(rt) &&
2600             fl4->saddr != src) {
2601                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2602                         goto nla_put_failure;
2603         }
2604         if (rt->rt_uses_gateway &&
2605             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2606                 goto nla_put_failure;
2607
2608         expires = rt->dst.expires;
2609         if (expires) {
2610                 unsigned long now = jiffies;
2611
2612                 if (time_before(now, expires))
2613                         expires -= now;
2614                 else
2615                         expires = 0;
2616         }
2617
2618         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2619         if (rt->rt_pmtu && expires)
2620                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2621         if (rtnetlink_put_metrics(skb, metrics) < 0)
2622                 goto nla_put_failure;
2623
2624         if (fl4->flowi4_mark &&
2625             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2626                 goto nla_put_failure;
2627
2628         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2629             nla_put_u32(skb, RTA_UID,
2630                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2631                 goto nla_put_failure;
2632
2633         error = rt->dst.error;
2634
2635         if (rt_is_input_route(rt)) {
2636 #ifdef CONFIG_IP_MROUTE
2637                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2638                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2639                         int err = ipmr_get_route(net, skb,
2640                                                  fl4->saddr, fl4->daddr,
2641                                                  r, portid);
2642
2643                         if (err <= 0) {
2644                                 if (err == 0)
2645                                         return 0;
2646                                 goto nla_put_failure;
2647                         }
2648                 } else
2649 #endif
2650                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2651                                 goto nla_put_failure;
2652         }
2653
2654         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2655                 goto nla_put_failure;
2656
2657         nlmsg_end(skb, nlh);
2658         return 0;
2659
2660 nla_put_failure:
2661         nlmsg_cancel(skb, nlh);
2662         return -EMSGSIZE;
2663 }
2664
2665 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2666                              struct netlink_ext_ack *extack)
2667 {
2668         struct net *net = sock_net(in_skb->sk);
2669         struct rtmsg *rtm;
2670         struct nlattr *tb[RTA_MAX+1];
2671         struct fib_result res = {};
2672         struct rtable *rt = NULL;
2673         struct flowi4 fl4;
2674         __be32 dst = 0;
2675         __be32 src = 0;
2676         u32 iif;
2677         int err;
2678         int mark;
2679         struct sk_buff *skb;
2680         u32 table_id = RT_TABLE_MAIN;
2681         kuid_t uid;
2682
2683         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2684                           extack);
2685         if (err < 0)
2686                 goto errout;
2687
2688         rtm = nlmsg_data(nlh);
2689
2690         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2691         if (!skb) {
2692                 err = -ENOBUFS;
2693                 goto errout;
2694         }
2695
2696         /* Reserve room for dummy headers; this skb can pass
2697            through a good chunk of the routing engine.
2698          */
2699         skb_reset_mac_header(skb);
2700         skb_reset_network_header(skb);
2701
2702         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2703         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2704         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2705         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2706         if (tb[RTA_UID])
2707                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2708         else
2709                 uid = (iif ? INVALID_UID : current_uid());
2710
2711         /* Bugfix: need to give ip_route_input enough of an IP header to
2712          * not gag.
2713          */
2714         ip_hdr(skb)->protocol = IPPROTO_UDP;
2715         ip_hdr(skb)->saddr = src;
2716         ip_hdr(skb)->daddr = dst;
2717
2718         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2719
2720         memset(&fl4, 0, sizeof(fl4));
2721         fl4.daddr = dst;
2722         fl4.saddr = src;
2723         fl4.flowi4_tos = rtm->rtm_tos;
2724         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2725         fl4.flowi4_mark = mark;
2726         fl4.flowi4_uid = uid;
2727
2728         rcu_read_lock();
2729
2730         if (iif) {
2731                 struct net_device *dev;
2732
2733                 dev = dev_get_by_index_rcu(net, iif);
2734                 if (!dev) {
2735                         err = -ENODEV;
2736                         goto errout_free;
2737                 }
2738
2739                 skb->protocol   = htons(ETH_P_IP);
2740                 skb->dev        = dev;
2741                 skb->mark       = mark;
2742                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2743                                          dev, &res);
2744
2745                 rt = skb_rtable(skb);
2746                 if (err == 0 && rt->dst.error)
2747                         err = -rt->dst.error;
2748         } else {
2749                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2750                 err = 0;
2751                 if (IS_ERR(rt))
2752                         err = PTR_ERR(rt);
2753         }
2754
2755         if (err)
2756                 goto errout_free;
2757
2758         skb_dst_set(skb, &rt->dst);
2759         if (rtm->rtm_flags & RTM_F_NOTIFY)
2760                 rt->rt_flags |= RTCF_NOTIFY;
2761
2762         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2763                 table_id = rt->rt_table_id;
2764
2765         if (rtm->rtm_flags & RTM_F_FIB_MATCH)
2766                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2767                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2768                                     rt->rt_type, res.prefix, res.prefixlen,
2769                                     fl4.flowi4_tos, res.fi, 0);
2770         else
2771                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2772                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2773         if (err < 0)
2774                 goto errout_free;
2775
2776         rcu_read_unlock();
2777
2778         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2779 errout:
2780         return err;
2781
2782 errout_free:
2783         rcu_read_unlock();
2784         kfree_skb(skb);
2785         goto errout;
2786 }
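/*
 * Editor's illustrative sketch (not part of the original file): the
 * userspace side of the handler above.  "ip route get <addr>" sends an
 * RTM_GETROUTE request much like this hypothetical snippet (error
 * handling elided); the reply is the RTM_NEWROUTE message built by
 * rt_fill_info().
 */
#if 0	/* example only -- userspace, not compiled here */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>
#include <string.h>
#include <unistd.h>

static int example_route_get(__be32 dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		struct rtattr rta;	/* RTA_DST */
		__be32 addr;
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rta.rta_type = RTA_DST;
	req.rta.rta_len = RTA_LENGTH(sizeof(req.addr));
	req.addr = dst;

	sendto(fd, &req, sizeof(req), 0, (struct sockaddr *)&sa, sizeof(sa));
	/* ...recv() and parse the RTM_NEWROUTE reply... */
	close(fd);
	return 0;
}
#endif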
2787
2788 void ip_rt_multicast_event(struct in_device *in_dev)
2789 {
2790         rt_cache_flush(dev_net(in_dev->dev));
2791 }
2792
2793 #ifdef CONFIG_SYSCTL
2794 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2795 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2796 static int ip_rt_gc_elasticity __read_mostly    = 8;
2797
2798 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2799                                         void __user *buffer,
2800                                         size_t *lenp, loff_t *ppos)
2801 {
2802         struct net *net = (struct net *)__ctl->extra1;
2803
2804         if (write) {
2805                 rt_cache_flush(net);
2806                 fnhe_genid_bump(net);
2807                 return 0;
2808         }
2809
2810         return -EINVAL;
2811 }
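/*
 * Editor's illustrative sketch (not part of the original file): any write
 * to the "flush" sysctl registered below reaches the handler above, which
 * ignores the written value and flushes unconditionally.  Equivalent to
 * "echo 1 > /proc/sys/net/ipv4/route/flush"; a hypothetical programmatic
 * form:
 */
#if 0	/* example only -- userspace, not compiled here */
#include <fcntl.h>
#include <unistd.h>

static void example_flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd >= 0) {
		write(fd, "1", 1);	/* value is ignored; any write flushes */
		close(fd);
	}
}
#endif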
2812
2813 static struct ctl_table ipv4_route_table[] = {
2814         {
2815                 .procname       = "gc_thresh",
2816                 .data           = &ipv4_dst_ops.gc_thresh,
2817                 .maxlen         = sizeof(int),
2818                 .mode           = 0644,
2819                 .proc_handler   = proc_dointvec,
2820         },
2821         {
2822                 .procname       = "max_size",
2823                 .data           = &ip_rt_max_size,
2824                 .maxlen         = sizeof(int),
2825                 .mode           = 0644,
2826                 .proc_handler   = proc_dointvec,
2827         },
2828         {
2829                 /*  Deprecated. Use gc_min_interval_ms */
2830
2831                 .procname       = "gc_min_interval",
2832                 .data           = &ip_rt_gc_min_interval,
2833                 .maxlen         = sizeof(int),
2834                 .mode           = 0644,
2835                 .proc_handler   = proc_dointvec_jiffies,
2836         },
2837         {
2838                 .procname       = "gc_min_interval_ms",
2839                 .data           = &ip_rt_gc_min_interval,
2840                 .maxlen         = sizeof(int),
2841                 .mode           = 0644,
2842                 .proc_handler   = proc_dointvec_ms_jiffies,
2843         },
2844         {
2845                 .procname       = "gc_timeout",
2846                 .data           = &ip_rt_gc_timeout,
2847                 .maxlen         = sizeof(int),
2848                 .mode           = 0644,
2849                 .proc_handler   = proc_dointvec_jiffies,
2850         },
2851         {
2852                 .procname       = "gc_interval",
2853                 .data           = &ip_rt_gc_interval,
2854                 .maxlen         = sizeof(int),
2855                 .mode           = 0644,
2856                 .proc_handler   = proc_dointvec_jiffies,
2857         },
2858         {
2859                 .procname       = "redirect_load",
2860                 .data           = &ip_rt_redirect_load,
2861                 .maxlen         = sizeof(int),
2862                 .mode           = 0644,
2863                 .proc_handler   = proc_dointvec,
2864         },
2865         {
2866                 .procname       = "redirect_number",
2867                 .data           = &ip_rt_redirect_number,
2868                 .maxlen         = sizeof(int),
2869                 .mode           = 0644,
2870                 .proc_handler   = proc_dointvec,
2871         },
2872         {
2873                 .procname       = "redirect_silence",
2874                 .data           = &ip_rt_redirect_silence,
2875                 .maxlen         = sizeof(int),
2876                 .mode           = 0644,
2877                 .proc_handler   = proc_dointvec,
2878         },
2879         {
2880                 .procname       = "error_cost",
2881                 .data           = &ip_rt_error_cost,
2882                 .maxlen         = sizeof(int),
2883                 .mode           = 0644,
2884                 .proc_handler   = proc_dointvec,
2885         },
2886         {
2887                 .procname       = "error_burst",
2888                 .data           = &ip_rt_error_burst,
2889                 .maxlen         = sizeof(int),
2890                 .mode           = 0644,
2891                 .proc_handler   = proc_dointvec,
2892         },
2893         {
2894                 .procname       = "gc_elasticity",
2895                 .data           = &ip_rt_gc_elasticity,
2896                 .maxlen         = sizeof(int),
2897                 .mode           = 0644,
2898                 .proc_handler   = proc_dointvec,
2899         },
2900         {
2901                 .procname       = "mtu_expires",
2902                 .data           = &ip_rt_mtu_expires,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = proc_dointvec_jiffies,
2906         },
2907         {
2908                 .procname       = "min_pmtu",
2909                 .data           = &ip_rt_min_pmtu,
2910                 .maxlen         = sizeof(int),
2911                 .mode           = 0644,
2912                 .proc_handler   = proc_dointvec,
2913         },
2914         {
2915                 .procname       = "min_adv_mss",
2916                 .data           = &ip_rt_min_advmss,
2917                 .maxlen         = sizeof(int),
2918                 .mode           = 0644,
2919                 .proc_handler   = proc_dointvec,
2920         },
2921         { }
2922 };
2923
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { }
};

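/*
 * Register the per-netns "flush" table.  init_net uses the static table
 * directly; other namespaces get a kmemdup()'d copy so that ->extra1 can
 * point at their own struct net.  The entry is hidden (procname cleared)
 * for namespaces not owned by the initial user namespace.
 */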
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

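/*
 * Per-netns teardown.  The BUG_ON() guards against freeing the static
 * table: in practice only namespaces other than init_net are destroyed
 * here, and those always own a kmemdup()'d copy.
 */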
static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

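/*
 * Generation counters: bumping rt_genid (see rt_cache_flush()) makes all
 * cached dst entries in this netns stale, and fnhe_genid does the same
 * for fib next-hop exceptions.  dev_addr_genid starts from a random
 * value so it is not predictable across boots.
 */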
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

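/*
 * Each netns gets its own inetpeer base, which holds long-lived
 * per-destination state (e.g. ICMP rate-limiting tokens) keyed by the
 * peer's address.
 */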
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

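/*
 * Per-CPU accounting buckets used when routing by realm/classid; one
 * struct ip_rt_acct per class (256 of them), updated on the forwarding
 * fast path.
 */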
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

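/*
 * Boot-time initialisation of the IPv4 routing layer: IP-ID generator
 * state, the per-CPU uncached rtable lists, the dst slab cache,
 * devinet/fib setup, /proc files, the RTM_GETROUTE handler and the
 * pernet subsystems defined above.  Allocation failures here are fatal
 * (panic), since routing cannot operate without these structures.
 */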
int __init ip_rt_init(void)
{
        int rc = 0;
        int cpu;

        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        /* Seed the IP-ID generator buckets with random values so that
         * identifiers are not predictable across boots.
         */
        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
                                    __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        /* With the old routing cache gone, dst garbage collection is
         * effectively disabled: never trigger on entry count, no size cap.
         */
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return rc;
}

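/*
 * Early registration of the static tunables table above for init_net
 * under /proc/sys/net/ipv4/route; the per-netns "flush" entry is added
 * separately via sysctl_route_ops.
 */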
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif