]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/ipv4/ip_tunnel.c
097b3e7c1e8f89052f6dd686d519a3a9f0624209
[karo-tx-linux.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67         return hash_32((__force u32)key ^ (__force u32)remote,
68                          IP_TNL_HASH_BITS);
69 }
70
71 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
72                              struct dst_entry *dst)
73 {
74         struct dst_entry *old_dst;
75
76         if (dst) {
77                 if (dst->flags & DST_NOCACHE)
78                         dst = NULL;
79                 else
80                         dst_clone(dst);
81         }
82         old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
83         dst_release(old_dst);
84 }
85
86 static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
87 {
88         __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
89 }
90
91 static void tunnel_dst_reset(struct ip_tunnel *t)
92 {
93         tunnel_dst_set(t, NULL);
94 }
95
96 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
97 {
98         int i;
99
100         for_each_possible_cpu(i)
101                 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
102 }
103 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
104
105 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
106 {
107         struct dst_entry *dst;
108
109         rcu_read_lock();
110         dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
111         if (dst) {
112                 if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
113                         rcu_read_unlock();
114                         tunnel_dst_reset(t);
115                         return NULL;
116                 }
117                 dst_hold(dst);
118         }
119         rcu_read_unlock();
120         return (struct rtable *)dst;
121 }
122
123 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
124                                 __be16 flags, __be32 key)
125 {
126         if (p->i_flags & TUNNEL_KEY) {
127                 if (flags & TUNNEL_KEY)
128                         return key == p->i_key;
129                 else
130                         /* key expected, none present */
131                         return false;
132         } else
133                 return !(flags & TUNNEL_KEY);
134 }
135
136 /* Fallback tunnel: no source, no destination, no key, no options
137
138    Tunnel hash table:
139    We require exact key match i.e. if a key is present in packet
140    it will match only tunnel with the same key; if it is not present,
141    it will match only keyless tunnel.
142
143    All keyless packets that do not match a configured keyless tunnel
144    will match the fallback tunnel.
145    Given src, dst and key, find the appropriate tunnel for input.
146 */
/* Find the tunnel that should receive a packet described by @link, @flags,
 * @remote, @local and @key.
 *
 * Four passes run from the most to the least specific match.  In every
 * pass a tunnel whose parms.link equals @link is returned immediately; a
 * match on a different link is only remembered as a candidate.  If no
 * exact-link match is found the candidate is returned, then the fallback
 * tunnel if its device exists and is up, otherwise NULL.
 */
147 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
148                                    int link, __be16 flags,
149                                    __be32 remote, __be32 local,
150                                    __be32 key)
151 {
152         unsigned int hash;
153         struct ip_tunnel *t, *cand = NULL;
154         struct hlist_head *head;
155
156         hash = ip_tunnel_hash(key, remote);
157         head = &itn->tunnels[hash];
158
            /* Pass 1: both endpoints match.  Unlike the later passes, every
             * wrong-link match here overwrites the candidate.
             */
159         hlist_for_each_entry_rcu(t, head, hash_node) {
160                 if (local != t->parms.iph.saddr ||
161                     remote != t->parms.iph.daddr ||
162                     !(t->dev->flags & IFF_UP))
163                         continue;
164
165                 if (!ip_tunnel_key_match(&t->parms, flags, key))
166                         continue;
167
168                 if (t->parms.link == link)
169                         return t;
170                 else
171                         cand = t;
172         }
173
            /* Pass 2: same bucket, only the remote address has to match.
             * A candidate found earlier is kept.
             */
174         hlist_for_each_entry_rcu(t, head, hash_node) {
175                 if (remote != t->parms.iph.daddr ||
176                     !(t->dev->flags & IFF_UP))
177                         continue;
178
179                 if (!ip_tunnel_key_match(&t->parms, flags, key))
180                         continue;
181
182                 if (t->parms.link == link)
183                         return t;
184                 else if (!cand)
185                         cand = t;
186         }
187
            /* The remaining passes use the bucket hashed with remote == 0. */
188         hash = ip_tunnel_hash(key, 0);
189         head = &itn->tunnels[hash];
190
            /* Pass 3: @local equals the tunnel's source address, or @local is
             * a multicast address equal to the tunnel's destination address.
             */
191         hlist_for_each_entry_rcu(t, head, hash_node) {
192                 if ((local != t->parms.iph.saddr &&
193                      (local != t->parms.iph.daddr ||
194                       !ipv4_is_multicast(local))) ||
195                     !(t->dev->flags & IFF_UP))
196                         continue;
197
198                 if (!ip_tunnel_key_match(&t->parms, flags, key))
199                         continue;
200
201                 if (t->parms.link == link)
202                         return t;
203                 else if (!cand)
204                         cand = t;
205         }
206
            /* Callers that set TUNNEL_NO_KEY skip the key-only pass below. */
207         if (flags & TUNNEL_NO_KEY)
208                 goto skip_key_lookup;
209
            /* Pass 4: only the input key has to match; addresses are ignored. */
210         hlist_for_each_entry_rcu(t, head, hash_node) {
211                 if (t->parms.i_key != key ||
212                     !(t->dev->flags & IFF_UP))
213                         continue;
214
215                 if (t->parms.link == link)
216                         return t;
217                 else if (!cand)
218                         cand = t;
219         }
220
221 skip_key_lookup:
222         if (cand)
223                 return cand;
224
            /* Last resort: the fallback device, provided it exists and is up. */
225         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
226                 return netdev_priv(itn->fb_tunnel_dev);
227
228
229         return NULL;
230 }
231 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
232
233 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
234                                     struct ip_tunnel_parm *parms)
235 {
236         unsigned int h;
237         __be32 remote;
238         __be32 i_key = parms->i_key;
239
240         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
241                 remote = parms->iph.daddr;
242         else
243                 remote = 0;
244
245         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
246                 i_key = 0;
247
248         h = ip_tunnel_hash(i_key, remote);
249         return &itn->tunnels[h];
250 }
251
252 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
253 {
254         struct hlist_head *head = ip_bucket(itn, &t->parms);
255
256         hlist_add_head_rcu(&t->hash_node, head);
257 }
258
259 static void ip_tunnel_del(struct ip_tunnel *t)
260 {
261         hlist_del_init_rcu(&t->hash_node);
262 }
263
264 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
265                                         struct ip_tunnel_parm *parms,
266                                         int type)
267 {
268         __be32 remote = parms->iph.daddr;
269         __be32 local = parms->iph.saddr;
270         __be32 key = parms->i_key;
271         __be16 flags = parms->i_flags;
272         int link = parms->link;
273         struct ip_tunnel *t = NULL;
274         struct hlist_head *head = ip_bucket(itn, parms);
275
276         hlist_for_each_entry_rcu(t, head, hash_node) {
277                 if (local == t->parms.iph.saddr &&
278                     remote == t->parms.iph.daddr &&
279                     link == t->parms.link &&
280                     type == t->dev->type &&
281                     ip_tunnel_key_match(&t->parms, flags, key))
282                         break;
283         }
284         return t;
285 }
286
287 static struct net_device *__ip_tunnel_create(struct net *net,
288                                              const struct rtnl_link_ops *ops,
289                                              struct ip_tunnel_parm *parms)
290 {
291         int err;
292         struct ip_tunnel *tunnel;
293         struct net_device *dev;
294         char name[IFNAMSIZ];
295
296         if (parms->name[0])
297                 strlcpy(name, parms->name, IFNAMSIZ);
298         else {
299                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
300                         err = -E2BIG;
301                         goto failed;
302                 }
303                 strlcpy(name, ops->kind, IFNAMSIZ);
304                 strncat(name, "%d", 2);
305         }
306
307         ASSERT_RTNL();
308         dev = alloc_netdev(ops->priv_size, name, ops->setup);
309         if (!dev) {
310                 err = -ENOMEM;
311                 goto failed;
312         }
313         dev_net_set(dev, net);
314
315         dev->rtnl_link_ops = ops;
316
317         tunnel = netdev_priv(dev);
318         tunnel->parms = *parms;
319         tunnel->net = net;
320
321         err = register_netdevice(dev);
322         if (err)
323                 goto failed_free;
324
325         return dev;
326
327 failed_free:
328         free_netdev(dev);
329 failed:
330         return ERR_PTR(err);
331 }
332
333 static inline void init_tunnel_flow(struct flowi4 *fl4,
334                                     int proto,
335                                     __be32 daddr, __be32 saddr,
336                                     __be32 key, __u8 tos, int oif)
337 {
338         memset(fl4, 0, sizeof(*fl4));
339         fl4->flowi4_oif = oif;
340         fl4->daddr = daddr;
341         fl4->saddr = saddr;
342         fl4->flowi4_tos = tos;
343         fl4->flowi4_proto = proto;
344         fl4->fl4_gre_key = key;
345 }
346
347 static int ip_tunnel_bind_dev(struct net_device *dev)
348 {
349         struct net_device *tdev = NULL;
350         struct ip_tunnel *tunnel = netdev_priv(dev);
351         const struct iphdr *iph;
352         int hlen = LL_MAX_HEADER;
353         int mtu = ETH_DATA_LEN;
354         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
355
356         iph = &tunnel->parms.iph;
357
358         /* Guess output device to choose reasonable mtu and needed_headroom */
359         if (iph->daddr) {
360                 struct flowi4 fl4;
361                 struct rtable *rt;
362
363                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
364                                  iph->saddr, tunnel->parms.o_key,
365                                  RT_TOS(iph->tos), tunnel->parms.link);
366                 rt = ip_route_output_key(tunnel->net, &fl4);
367
368                 if (!IS_ERR(rt)) {
369                         tdev = rt->dst.dev;
370                         tunnel_dst_set(tunnel, &rt->dst);
371                         ip_rt_put(rt);
372                 }
373                 if (dev->type != ARPHRD_ETHER)
374                         dev->flags |= IFF_POINTOPOINT;
375         }
376
377         if (!tdev && tunnel->parms.link)
378                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
379
380         if (tdev) {
381                 hlen = tdev->hard_header_len + tdev->needed_headroom;
382                 mtu = tdev->mtu;
383         }
384         dev->iflink = tunnel->parms.link;
385
386         dev->needed_headroom = t_hlen + hlen;
387         mtu -= (dev->hard_header_len + t_hlen);
388
389         if (mtu < 68)
390                 mtu = 68;
391
392         return mtu;
393 }
394
395 static struct ip_tunnel *ip_tunnel_create(struct net *net,
396                                           struct ip_tunnel_net *itn,
397                                           struct ip_tunnel_parm *parms)
398 {
399         struct ip_tunnel *nt;
400         struct net_device *dev;
401
402         BUG_ON(!itn->fb_tunnel_dev);
403         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
404         if (IS_ERR(dev))
405                 return ERR_CAST(dev);
406
407         dev->mtu = ip_tunnel_bind_dev(dev);
408
409         nt = netdev_priv(dev);
410         ip_tunnel_add(itn, nt);
411         return nt;
412 }
413
414 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
415                   const struct tnl_ptk_info *tpi, bool log_ecn_error)
416 {
417         struct pcpu_sw_netstats *tstats;
418         const struct iphdr *iph = ip_hdr(skb);
419         int err;
420
421 #ifdef CONFIG_NET_IPGRE_BROADCAST
422         if (ipv4_is_multicast(iph->daddr)) {
423                 tunnel->dev->stats.multicast++;
424                 skb->pkt_type = PACKET_BROADCAST;
425         }
426 #endif
427
428         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
429              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
430                 tunnel->dev->stats.rx_crc_errors++;
431                 tunnel->dev->stats.rx_errors++;
432                 goto drop;
433         }
434
435         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
436                 if (!(tpi->flags&TUNNEL_SEQ) ||
437                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
438                         tunnel->dev->stats.rx_fifo_errors++;
439                         tunnel->dev->stats.rx_errors++;
440                         goto drop;
441                 }
442                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
443         }
444
445         skb_reset_network_header(skb);
446
447         err = IP_ECN_decapsulate(iph, skb);
448         if (unlikely(err)) {
449                 if (log_ecn_error)
450                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
451                                         &iph->saddr, iph->tos);
452                 if (err > 1) {
453                         ++tunnel->dev->stats.rx_frame_errors;
454                         ++tunnel->dev->stats.rx_errors;
455                         goto drop;
456                 }
457         }
458
459         tstats = this_cpu_ptr(tunnel->dev->tstats);
460         u64_stats_update_begin(&tstats->syncp);
461         tstats->rx_packets++;
462         tstats->rx_bytes += skb->len;
463         u64_stats_update_end(&tstats->syncp);
464
465         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
466
467         if (tunnel->dev->type == ARPHRD_ETHER) {
468                 skb->protocol = eth_type_trans(skb, tunnel->dev);
469                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
470         } else {
471                 skb->dev = tunnel->dev;
472         }
473
474         gro_cells_receive(&tunnel->gro_cells, skb);
475         return 0;
476
477 drop:
478         kfree_skb(skb);
479         return 0;
480 }
481 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
482
483 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
484                             struct rtable *rt, __be16 df)
485 {
486         struct ip_tunnel *tunnel = netdev_priv(dev);
487         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
488         int mtu;
489
490         if (df)
491                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
492                                         - sizeof(struct iphdr) - tunnel->hlen;
493         else
494                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
495
496         if (skb_dst(skb))
497                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
498
499         if (skb->protocol == htons(ETH_P_IP)) {
500                 if (!skb_is_gso(skb) &&
501                     (df & htons(IP_DF)) && mtu < pkt_size) {
502                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
503                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
504                         return -E2BIG;
505                 }
506         }
507 #if IS_ENABLED(CONFIG_IPV6)
508         else if (skb->protocol == htons(ETH_P_IPV6)) {
509                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
510
511                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
512                            mtu >= IPV6_MIN_MTU) {
513                         if ((tunnel->parms.iph.daddr &&
514                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
515                             rt6->rt6i_dst.plen == 128) {
516                                 rt6->rt6i_flags |= RTF_MODIFIED;
517                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
518                         }
519                 }
520
521                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
522                                         mtu < pkt_size) {
523                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
524                         return -E2BIG;
525                 }
526         }
527 #endif
528         return 0;
529 }
530
/* Encapsulate @skb and transmit it through tunnel device @dev.
 *
 * @tnl_params supplies the outer header template (addresses, tos, ttl,
 * frag_off) and @protocol the outer IP protocol number.  On every error
 * path the skb is freed here and a counter in dev->stats is bumped; the
 * function itself returns nothing.
 */
531 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
532                     const struct iphdr *tnl_params, const u8 protocol)
533 {
534         struct ip_tunnel *tunnel = netdev_priv(dev);
535         const struct iphdr *inner_iph;
536         struct flowi4 fl4;
537         u8     tos, ttl;
538         __be16 df;
539         struct rtable *rt;              /* Route to the other host */
540         unsigned int max_headroom;      /* The extra header space needed */
541         __be32 dst;
542         int err;
543         bool connected;
544
545         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
            /* "connected" gates use of the per-cpu route cache further down;
             * it starts out true only when a fixed destination is configured.
             */
546         connected = (tunnel->parms.iph.daddr != 0);
547
548         dst = tnl_params->daddr;
549         if (dst == 0) {
550                 /* NBMA tunnel */
551
                    /* No fixed destination: derive it from the packet's own
                     * route (IPv4) or neighbour entry (IPv6).
                     */
552                 if (skb_dst(skb) == NULL) {
553                         dev->stats.tx_fifo_errors++;
554                         goto tx_error;
555                 }
556
557                 if (skb->protocol == htons(ETH_P_IP)) {
558                         rt = skb_rtable(skb);
559                         dst = rt_nexthop(rt, inner_iph->daddr);
560                 }
561 #if IS_ENABLED(CONFIG_IPV6)
562                 else if (skb->protocol == htons(ETH_P_IPV6)) {
563                         const struct in6_addr *addr6;
564                         struct neighbour *neigh;
565                         bool do_tx_error_icmp;
566                         int addr_type;
567
568                         neigh = dst_neigh_lookup(skb_dst(skb),
569                                                  &ipv6_hdr(skb)->daddr);
570                         if (neigh == NULL)
571                                 goto tx_error;
572
573                         addr6 = (const struct in6_addr *)&neigh->primary_key;
574                         addr_type = ipv6_addr_type(addr6);
575
                            /* Unspecified neighbour key: use the packet's
                             * destination address instead.
                             */
576                         if (addr_type == IPV6_ADDR_ANY) {
577                                 addr6 = &ipv6_hdr(skb)->daddr;
578                                 addr_type = ipv6_addr_type(addr6);
579                         }
580
                            /* Only IPv4-compatible addresses yield an IPv4
                             * destination (taken from the last 32 bits).  The
                             * decision is recorded in a flag so the neighbour
                             * is released before jumping to the error path.
                             */
581                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
582                                 do_tx_error_icmp = true;
583                         else {
584                                 do_tx_error_icmp = false;
585                                 dst = addr6->s6_addr32[3];
586                         }
587                         neigh_release(neigh);
588                         if (do_tx_error_icmp)
589                                 goto tx_error_icmp;
590                 }
591 #endif
592                 else
593                         goto tx_error;
594
595                 connected = false;
596         }
597
598         tos = tnl_params->tos;
            /* Low bit set in the configured TOS means "inherit": take the
             * value from the inner IPv4/IPv6 header.  The cached route is
             * not used in that case.
             */
599         if (tos & 0x1) {
600                 tos &= ~0x1;
601                 if (skb->protocol == htons(ETH_P_IP)) {
602                         tos = inner_iph->tos;
603                         connected = false;
604                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
605                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
606                         connected = false;
607                 }
608         }
609
610         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
611                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
612
            /* Try the per-cpu cached route first; on a miss do a full lookup
             * and, for connected tunnels, store the result in the cache.
             */
613         rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
614
615         if (!rt) {
616                 rt = ip_route_output_key(tunnel->net, &fl4);
617
618                 if (IS_ERR(rt)) {
619                         dev->stats.tx_carrier_errors++;
620                         goto tx_error;
621                 }
622                 if (connected)
623                         tunnel_dst_set(tunnel, &rt->dst);
624         }
625
            /* The outer route leads back into this tunnel device: drop the
             * packet and count it as a collision.
             */
626         if (rt->dst.dev == dev) {
627                 ip_rt_put(rt);
628                 dev->stats.collisions++;
629                 goto tx_error;
630         }
631
632         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
633                 ip_rt_put(rt);
634                 goto tx_error;
635         }
636
            /* While errors are pending and the last one is more recent than
             * IPTUNNEL_ERR_TIMEO, report a link failure for this skb and
             * consume one pending error; otherwise clear the counter.
             */
637         if (tunnel->err_count > 0) {
638                 if (time_before(jiffies,
639                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
640                         tunnel->err_count--;
641
642                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
643                         dst_link_failure(skb);
644                 } else
645                         tunnel->err_count = 0;
646         }
647
648         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
649         ttl = tnl_params->ttl;
            /* A configured TTL of 0 means "inherit" from the inner header,
             * or from the outer route's hop limit for other protocols.
             */
650         if (ttl == 0) {
651                 if (skb->protocol == htons(ETH_P_IP))
652                         ttl = inner_iph->ttl;
653 #if IS_ENABLED(CONFIG_IPV6)
654                 else if (skb->protocol == htons(ETH_P_IPV6))
655                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
656 #endif
657                 else
658                         ttl = ip4_dst_hoplimit(&rt->dst);
659         }
660
            /* For IPv4 payloads the inner DF bit is added to the configured
             * outer frag_off.
             */
661         df = tnl_params->frag_off;
662         if (skb->protocol == htons(ETH_P_IP))
663                 df |= (inner_iph->frag_off&htons(IP_DF));
664
            /* needed_headroom only ever grows: raise it if this route needs
             * more space than recorded so far.
             */
665         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
666                         + rt->dst.header_len;
667         if (max_headroom > dev->needed_headroom)
668                 dev->needed_headroom = max_headroom;
669
670         if (skb_cow_head(skb, dev->needed_headroom)) {
671                 ip_rt_put(rt);
672                 dev->stats.tx_dropped++;
673                 kfree_skb(skb);
674                 return;
675         }
676
            /* rt is handed to iptunnel_xmit() here; no ip_rt_put() follows
             * on this path.
             */
677         err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
678                             tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
679         iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
680
681         return;
682
683 #if IS_ENABLED(CONFIG_IPV6)
684 tx_error_icmp:
685         dst_link_failure(skb);
686 #endif
687 tx_error:
688         dev->stats.tx_errors++;
689         kfree_skb(skb);
690 }
691 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
692
693 static void ip_tunnel_update(struct ip_tunnel_net *itn,
694                              struct ip_tunnel *t,
695                              struct net_device *dev,
696                              struct ip_tunnel_parm *p,
697                              bool set_mtu)
698 {
699         ip_tunnel_del(t);
700         t->parms.iph.saddr = p->iph.saddr;
701         t->parms.iph.daddr = p->iph.daddr;
702         t->parms.i_key = p->i_key;
703         t->parms.o_key = p->o_key;
704         if (dev->type != ARPHRD_ETHER) {
705                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
706                 memcpy(dev->broadcast, &p->iph.daddr, 4);
707         }
708         ip_tunnel_add(itn, t);
709
710         t->parms.iph.ttl = p->iph.ttl;
711         t->parms.iph.tos = p->iph.tos;
712         t->parms.iph.frag_off = p->iph.frag_off;
713
714         if (t->parms.link != p->link) {
715                 int mtu;
716
717                 t->parms.link = p->link;
718                 mtu = ip_tunnel_bind_dev(dev);
719                 if (set_mtu)
720                         dev->mtu = mtu;
721         }
722         ip_tunnel_dst_reset_all(t);
723         netdev_state_change(dev);
724 }
725
726 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
727 {
728         int err = 0;
729         struct ip_tunnel *t = netdev_priv(dev);
730         struct net *net = t->net;
731         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
732
733         BUG_ON(!itn->fb_tunnel_dev);
734         switch (cmd) {
735         case SIOCGETTUNNEL:
736                 if (dev == itn->fb_tunnel_dev) {
737                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
738                         if (t == NULL)
739                                 t = netdev_priv(dev);
740                 }
741                 memcpy(p, &t->parms, sizeof(*p));
742                 break;
743
744         case SIOCADDTUNNEL:
745         case SIOCCHGTUNNEL:
746                 err = -EPERM;
747                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
748                         goto done;
749                 if (p->iph.ttl)
750                         p->iph.frag_off |= htons(IP_DF);
751                 if (!(p->i_flags & VTI_ISVTI)) {
752                         if (!(p->i_flags & TUNNEL_KEY))
753                                 p->i_key = 0;
754                         if (!(p->o_flags & TUNNEL_KEY))
755                                 p->o_key = 0;
756                 }
757
758                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
759
760                 if (!t && (cmd == SIOCADDTUNNEL)) {
761                         t = ip_tunnel_create(net, itn, p);
762                         err = PTR_ERR_OR_ZERO(t);
763                         break;
764                 }
765                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
766                         if (t != NULL) {
767                                 if (t->dev != dev) {
768                                         err = -EEXIST;
769                                         break;
770                                 }
771                         } else {
772                                 unsigned int nflags = 0;
773
774                                 if (ipv4_is_multicast(p->iph.daddr))
775                                         nflags = IFF_BROADCAST;
776                                 else if (p->iph.daddr)
777                                         nflags = IFF_POINTOPOINT;
778
779                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
780                                         err = -EINVAL;
781                                         break;
782                                 }
783
784                                 t = netdev_priv(dev);
785                         }
786                 }
787
788                 if (t) {
789                         err = 0;
790                         ip_tunnel_update(itn, t, dev, p, true);
791                 } else {
792                         err = -ENOENT;
793                 }
794                 break;
795
796         case SIOCDELTUNNEL:
797                 err = -EPERM;
798                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
799                         goto done;
800
801                 if (dev == itn->fb_tunnel_dev) {
802                         err = -ENOENT;
803                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
804                         if (t == NULL)
805                                 goto done;
806                         err = -EPERM;
807                         if (t == netdev_priv(itn->fb_tunnel_dev))
808                                 goto done;
809                         dev = t->dev;
810                 }
811                 unregister_netdevice(dev);
812                 err = 0;
813                 break;
814
815         default:
816                 err = -EINVAL;
817         }
818
819 done:
820         return err;
821 }
822 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
823
824 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
825 {
826         struct ip_tunnel *tunnel = netdev_priv(dev);
827         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
828
829         if (new_mtu < 68 ||
830             new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
831                 return -EINVAL;
832         dev->mtu = new_mtu;
833         return 0;
834 }
835 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
836
837 static void ip_tunnel_dev_free(struct net_device *dev)
838 {
839         struct ip_tunnel *tunnel = netdev_priv(dev);
840
841         gro_cells_destroy(&tunnel->gro_cells);
842         free_percpu(tunnel->dst_cache);
843         free_percpu(dev->tstats);
844         free_netdev(dev);
845 }
846
847 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
848 {
849         struct ip_tunnel *tunnel = netdev_priv(dev);
850         struct ip_tunnel_net *itn;
851
852         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
853
854         if (itn->fb_tunnel_dev != dev) {
855                 ip_tunnel_del(netdev_priv(dev));
856                 unregister_netdevice_queue(dev, head);
857         }
858 }
859 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
860
861 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
862                                   struct rtnl_link_ops *ops, char *devname)
863 {
864         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
865         struct ip_tunnel_parm parms;
866         unsigned int i;
867
868         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
869                 INIT_HLIST_HEAD(&itn->tunnels[i]);
870
871         if (!ops) {
872                 itn->fb_tunnel_dev = NULL;
873                 return 0;
874         }
875
876         memset(&parms, 0, sizeof(parms));
877         if (devname)
878                 strlcpy(parms.name, devname, IFNAMSIZ);
879
880         rtnl_lock();
881         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
882         /* FB netdevice is special: we have one, and only one per netns.
883          * Allowing to move it to another netns is clearly unsafe.
884          */
885         if (!IS_ERR(itn->fb_tunnel_dev)) {
886                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
887                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
888                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
889         }
890         rtnl_unlock();
891
892         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
893 }
894 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
895
896 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
897                               struct rtnl_link_ops *ops)
898 {
899         struct net *net = dev_net(itn->fb_tunnel_dev);
900         struct net_device *dev, *aux;
901         int h;
902
903         for_each_netdev_safe(net, dev, aux)
904                 if (dev->rtnl_link_ops == ops)
905                         unregister_netdevice_queue(dev, head);
906
907         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
908                 struct ip_tunnel *t;
909                 struct hlist_node *n;
910                 struct hlist_head *thead = &itn->tunnels[h];
911
912                 hlist_for_each_entry_safe(t, n, thead, hash_node)
913                         /* If dev is in the same netns, it has already
914                          * been added to the list by the previous loop.
915                          */
916                         if (!net_eq(dev_net(t->dev), net))
917                                 unregister_netdevice_queue(t->dev, head);
918         }
919 }
920
921 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
922 {
923         LIST_HEAD(list);
924
925         rtnl_lock();
926         ip_tunnel_destroy(itn, &list, ops);
927         unregister_netdevice_many(&list);
928         rtnl_unlock();
929 }
930 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
931
932 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
933                       struct ip_tunnel_parm *p)
934 {
935         struct ip_tunnel *nt;
936         struct net *net = dev_net(dev);
937         struct ip_tunnel_net *itn;
938         int mtu;
939         int err;
940
941         nt = netdev_priv(dev);
942         itn = net_generic(net, nt->ip_tnl_net_id);
943
944         if (ip_tunnel_find(itn, p, dev->type))
945                 return -EEXIST;
946
947         nt->net = net;
948         nt->parms = *p;
949         err = register_netdevice(dev);
950         if (err)
951                 goto out;
952
953         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
954                 eth_hw_addr_random(dev);
955
956         mtu = ip_tunnel_bind_dev(dev);
957         if (!tb[IFLA_MTU])
958                 dev->mtu = mtu;
959
960         ip_tunnel_add(itn, nt);
961
962 out:
963         return err;
964 }
965 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
966
967 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
968                          struct ip_tunnel_parm *p)
969 {
970         struct ip_tunnel *t;
971         struct ip_tunnel *tunnel = netdev_priv(dev);
972         struct net *net = tunnel->net;
973         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
974
975         if (dev == itn->fb_tunnel_dev)
976                 return -EINVAL;
977
978         t = ip_tunnel_find(itn, p, dev->type);
979
980         if (t) {
981                 if (t->dev != dev)
982                         return -EEXIST;
983         } else {
984                 t = tunnel;
985
986                 if (dev->type != ARPHRD_ETHER) {
987                         unsigned int nflags = 0;
988
989                         if (ipv4_is_multicast(p->iph.daddr))
990                                 nflags = IFF_BROADCAST;
991                         else if (p->iph.daddr)
992                                 nflags = IFF_POINTOPOINT;
993
994                         if ((dev->flags ^ nflags) &
995                             (IFF_POINTOPOINT | IFF_BROADCAST))
996                                 return -EINVAL;
997                 }
998         }
999
1000         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1001         return 0;
1002 }
1003 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1004
1005 int ip_tunnel_init(struct net_device *dev)
1006 {
1007         struct ip_tunnel *tunnel = netdev_priv(dev);
1008         struct iphdr *iph = &tunnel->parms.iph;
1009         int err;
1010
1011         dev->destructor = ip_tunnel_dev_free;
1012         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1013         if (!dev->tstats)
1014                 return -ENOMEM;
1015
1016         tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1017         if (!tunnel->dst_cache) {
1018                 free_percpu(dev->tstats);
1019                 return -ENOMEM;
1020         }
1021
1022         err = gro_cells_init(&tunnel->gro_cells, dev);
1023         if (err) {
1024                 free_percpu(tunnel->dst_cache);
1025                 free_percpu(dev->tstats);
1026                 return err;
1027         }
1028
1029         tunnel->dev = dev;
1030         tunnel->net = dev_net(dev);
1031         strcpy(tunnel->parms.name, dev->name);
1032         iph->version            = 4;
1033         iph->ihl                = 5;
1034
1035         return 0;
1036 }
1037 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1038
1039 void ip_tunnel_uninit(struct net_device *dev)
1040 {
1041         struct ip_tunnel *tunnel = netdev_priv(dev);
1042         struct net *net = tunnel->net;
1043         struct ip_tunnel_net *itn;
1044
1045         itn = net_generic(net, tunnel->ip_tnl_net_id);
1046         /* fb_tunnel_dev will be unregisted in net-exit call. */
1047         if (itn->fb_tunnel_dev != dev)
1048                 ip_tunnel_del(netdev_priv(dev));
1049
1050         ip_tunnel_dst_reset_all(tunnel);
1051 }
1052 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1053
1054 /* Do least required initialization, rest of init is done in tunnel_init call */
1055 void ip_tunnel_setup(struct net_device *dev, int net_id)
1056 {
1057         struct ip_tunnel *tunnel = netdev_priv(dev);
1058         tunnel->ip_tnl_net_id = net_id;
1059 }
1060 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1061
1062 MODULE_LICENSE("GPL");