1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/bpf-cgroup.h>
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58 #include <linux/mroute6.h>
59 #include <net/l3mdev.h>
60 #include <net/lwtunnel.h>
61
62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63 {
64         struct dst_entry *dst = skb_dst(skb);
65         struct net_device *dev = dst->dev;
66         struct neighbour *neigh;
67         struct in6_addr *nexthop;
68         int ret;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107                 int res = lwtunnel_xmit(skb);
108
109                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110                         return res;
111         }
112
113         rcu_read_lock_bh();
114         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116         if (unlikely(!neigh))
117                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118         if (!IS_ERR(neigh)) {
119                 sock_confirm_neigh(skb, neigh);
120                 ret = neigh_output(neigh, skb);
121                 rcu_read_unlock_bh();
122                 return ret;
123         }
124         rcu_read_unlock_bh();
125
126         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127         kfree_skb(skb);
128         return -EINVAL;
129 }
130
131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132 {
133         int ret;
134
135         ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136         if (ret) {
137                 kfree_skb(skb);
138                 return ret;
139         }
140
141         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
142             dst_allfrag(skb_dst(skb)) ||
143             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
144                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
145         else
146                 return ip6_finish_output2(net, sk, skb);
147 }
148
149 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
150 {
151         struct net_device *dev = skb_dst(skb)->dev;
152         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
153
154         skb->protocol = htons(ETH_P_IPV6);
155         skb->dev = dev;
156
157         if (unlikely(idev->cnf.disable_ipv6)) {
158                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
159                 kfree_skb(skb);
160                 return 0;
161         }
162
163         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
164                             net, sk, skb, NULL, dev,
165                             ip6_finish_output,
166                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
167 }
168
169 /*
170  * xmit an sk_buff (used by TCP, SCTP and DCCP)
171  * Note: the socket lock is not held for SYNACK packets, but the socket
172  * may still be modified by calls to skb_set_owner_w() and
173  * ipv6_local_error(), which use proper atomic operations or spinlocks.
174  */
175 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
176              __u32 mark, struct ipv6_txoptions *opt, int tclass)
177 {
178         struct net *net = sock_net(sk);
179         const struct ipv6_pinfo *np = inet6_sk(sk);
180         struct in6_addr *first_hop = &fl6->daddr;
181         struct dst_entry *dst = skb_dst(skb);
182         struct ipv6hdr *hdr;
183         u8  proto = fl6->flowi6_proto;
184         int seg_len = skb->len;
185         int hlimit = -1;
186         u32 mtu;
187
188         if (opt) {
189                 unsigned int head_room;
190
191                 /* First: exthdrs may take lots of space (~8K for now);
192                    MAX_HEADER is not enough.
193                  */
194                 head_room = opt->opt_nflen + opt->opt_flen;
195                 seg_len += head_room;
196                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
197
198                 if (skb_headroom(skb) < head_room) {
199                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
200                         if (!skb2) {
201                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
202                                               IPSTATS_MIB_OUTDISCARDS);
203                                 kfree_skb(skb);
204                                 return -ENOBUFS;
205                         }
206                         consume_skb(skb);
207                         skb = skb2;
208                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
209                          * so it is safe to call in our context (socket lock not held)
210                          */
211                         skb_set_owner_w(skb, (struct sock *)sk);
212                 }
213                 if (opt->opt_flen)
214                         ipv6_push_frag_opts(skb, opt, &proto);
215                 if (opt->opt_nflen)
216                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
217                                              &fl6->saddr);
218         }
219
220         skb_push(skb, sizeof(struct ipv6hdr));
221         skb_reset_network_header(skb);
222         hdr = ipv6_hdr(skb);
223
224         /*
225          *      Fill in the IPv6 header
226          */
227         if (np)
228                 hlimit = np->hop_limit;
229         if (hlimit < 0)
230                 hlimit = ip6_dst_hoplimit(dst);
231
232         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
233                                                      np->autoflowlabel, fl6));
234
235         hdr->payload_len = htons(seg_len);
236         hdr->nexthdr = proto;
237         hdr->hop_limit = hlimit;
238
239         hdr->saddr = fl6->saddr;
240         hdr->daddr = *first_hop;
241
242         skb->protocol = htons(ETH_P_IPV6);
243         skb->priority = sk->sk_priority;
244         skb->mark = mark;
245
246         mtu = dst_mtu(dst);
247         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
248                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
249                               IPSTATS_MIB_OUT, skb->len);
250
251                 /* if egress device is enslaved to an L3 master device pass the
252                  * skb to its handler for processing
253                  */
254                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
255                 if (unlikely(!skb))
256                         return 0;
257
258                 /* hooks should never assume socket lock is held.
259                  * we promote our socket to non const
260                  */
261                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
262                                net, (struct sock *)sk, skb, NULL, dst->dev,
263                                dst_output);
264         }
265
266         skb->dev = dst->dev;
267         /* ipv6_local_error() does not require socket lock,
268          * we promote our socket to non const
269          */
270         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
271
272         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
273         kfree_skb(skb);
274         return -EMSGSIZE;
275 }
276 EXPORT_SYMBOL(ip6_xmit);
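
/*
 * A minimal sketch of how a connection-oriented caller such as TCP
 * might drive ip6_xmit(): build a flowi6 from the connection state,
 * attach the routed dst to the skb, and hand the packet off. The
 * helper name and the pre-routed dst parameter are hypothetical;
 * real callers obtain and cache the route themselves.
 */
static int example_xmit_one(struct sock *sk, struct sk_buff *skb,
			    struct dst_entry *dst)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	fl6.daddr = sk->sk_v6_daddr;
	fl6.saddr = np->saddr;
	fl6.flowi6_oif = sk->sk_bound_dev_if;

	/* ip6_xmit() reads the route from skb_dst() */
	skb_dst_set(skb, dst_clone(dst));

	/* no extension headers (opt == NULL), default traffic class */
	return ip6_xmit(sk, skb, &fl6, sk->sk_mark, NULL, 0);
}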
277
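/*
 * Deliver a Router Alert packet to every raw socket that registered
 * for this alert value via setsockopt(IPV6_ROUTER_ALERT). Every
 * matching socket except the last gets a clone; the last one consumes
 * the original skb, in which case 1 is returned and the caller must
 * not touch the skb again.
 */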
278 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
279 {
280         struct ip6_ra_chain *ra;
281         struct sock *last = NULL;
282
283         read_lock(&ip6_ra_lock);
284         for (ra = ip6_ra_chain; ra; ra = ra->next) {
285                 struct sock *sk = ra->sk;
286                 if (sk && ra->sel == sel &&
287                     (!sk->sk_bound_dev_if ||
288                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
289                         if (last) {
290                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
291                                 if (skb2)
292                                         rawv6_rcv(last, skb2);
293                         }
294                         last = sk;
295                 }
296         }
297
298         if (last) {
299                 rawv6_rcv(last, skb);
300                 read_unlock(&ip6_ra_lock);
301                 return 1;
302         }
303         read_unlock(&ip6_ra_lock);
304         return 0;
305 }
306
307 static int ip6_forward_proxy_check(struct sk_buff *skb)
308 {
309         struct ipv6hdr *hdr = ipv6_hdr(skb);
310         u8 nexthdr = hdr->nexthdr;
311         __be16 frag_off;
312         int offset;
313
314         if (ipv6_ext_hdr(nexthdr)) {
315                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
316                 if (offset < 0)
317                         return 0;
318         } else
319                 offset = sizeof(struct ipv6hdr);
320
321         if (nexthdr == IPPROTO_ICMPV6) {
322                 struct icmp6hdr *icmp6;
323
324                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
325                                          offset + 1 - skb->data)))
326                         return 0;
327
328                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
329
330                 switch (icmp6->icmp6_type) {
331                 case NDISC_ROUTER_SOLICITATION:
332                 case NDISC_ROUTER_ADVERTISEMENT:
333                 case NDISC_NEIGHBOUR_SOLICITATION:
334                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
335                 case NDISC_REDIRECT:
336                         /* If this is a unicast neighbour discovery
337                          * message destined to the proxied address,
338                          * pass it to the input function.
339                          */
340                         return 1;
341                 default:
342                         break;
343                 }
344         }
345
346         /*
347          * The proxying router can't forward traffic sent to a link-local
348          * address, so signal the sender and discard the packet. This
349          * behavior is clarified by the MIPv6 specification.
350          */
351         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
352                 dst_link_failure(skb);
353                 return -1;
354         }
355
356         return 0;
357 }
358
359 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
360                                      struct sk_buff *skb)
361 {
362         return dst_output(net, sk, skb);
363 }
364
365 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
366 {
367         unsigned int mtu;
368         struct inet6_dev *idev;
369
370         if (dst_metric_locked(dst, RTAX_MTU)) {
371                 mtu = dst_metric_raw(dst, RTAX_MTU);
372                 if (mtu)
373                         return mtu;
374         }
375
376         mtu = IPV6_MIN_MTU;
377         rcu_read_lock();
378         idev = __in6_dev_get(dst->dev);
379         if (idev)
380                 mtu = idev->cnf.mtu6;
381         rcu_read_unlock();
382
383         return mtu;
384 }
385
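/*
 * A packet counts as "too big" only if it exceeds the MTU and no
 * exception applies: for conntrack-defragmented packets the recorded
 * frag_max_size is compared against the MTU instead, ignore_df skips
 * the check entirely, and GSO packets pass if segmentation will bring
 * every segment under the MTU.
 */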
386 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
387 {
388         if (skb->len <= mtu)
389                 return false;
390
391         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
392         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
393                 return true;
394
395         if (skb->ignore_df)
396                 return false;
397
398         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
399                 return false;
400
401         return true;
402 }
403
404 int ip6_forward(struct sk_buff *skb)
405 {
406         struct dst_entry *dst = skb_dst(skb);
407         struct ipv6hdr *hdr = ipv6_hdr(skb);
408         struct inet6_skb_parm *opt = IP6CB(skb);
409         struct net *net = dev_net(dst->dev);
410         u32 mtu;
411
412         if (net->ipv6.devconf_all->forwarding == 0)
413                 goto error;
414
415         if (skb->pkt_type != PACKET_HOST)
416                 goto drop;
417
418         if (unlikely(skb->sk))
419                 goto drop;
420
421         if (skb_warn_if_lro(skb))
422                 goto drop;
423
424         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
425                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
426                                 IPSTATS_MIB_INDISCARDS);
427                 goto drop;
428         }
429
430         skb_forward_csum(skb);
431
432         /*
433          *      We do NOT do any processing on
434          *      RA packets; we push them to user level AS IS
435          *      without any warranty that the application will be
436          *      able to interpret them. The reason is that we
437          *      cannot do anything clever here.
438          *
439          *      We are not the end node, so if the packet contains
440          *      AH/ESP we cannot do anything.
441          *      Defragmentation would also be a mistake; RA packets
442          *      cannot be fragmented, because there is no guarantee
443          *      that different fragments will travel along one path. --ANK
444          */
445         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
446                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
447                         return 0;
448         }
449
450         /*
451          *      check and decrement hop limit
452          */
453         if (hdr->hop_limit <= 1) {
454                 /* Force the OUTPUT device to be used for source address selection */
455                 skb->dev = dst->dev;
456                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
457                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
458                                 IPSTATS_MIB_INHDRERRORS);
459
460                 kfree_skb(skb);
461                 return -ETIMEDOUT;
462         }
463
464         /* XXX: idev->cnf.proxy_ndp? */
465         if (net->ipv6.devconf_all->proxy_ndp &&
466             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
467                 int proxied = ip6_forward_proxy_check(skb);
468                 if (proxied > 0)
469                         return ip6_input(skb);
470                 else if (proxied < 0) {
471                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
472                                         IPSTATS_MIB_INDISCARDS);
473                         goto drop;
474                 }
475         }
476
477         if (!xfrm6_route_forward(skb)) {
478                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
479                                 IPSTATS_MIB_INDISCARDS);
480                 goto drop;
481         }
482         dst = skb_dst(skb);
483
484         /* IPv6 specs say nothing about it, but it is clear that we cannot
485            send redirects to source routed frames.
486            We don't send redirects to frames decapsulated from IPsec.
487          */
488         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
489                 struct in6_addr *target = NULL;
490                 struct inet_peer *peer;
491                 struct rt6_info *rt;
492
493                 /*
494                  *      The incoming and outgoing devices are the same;
495                  *      send a redirect.
496                  */
497
498                 rt = (struct rt6_info *) dst;
499                 if (rt->rt6i_flags & RTF_GATEWAY)
500                         target = &rt->rt6i_gateway;
501                 else
502                         target = &hdr->daddr;
503
504                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
505
506                 /* Limit redirects both by destination (here)
507                    and by source (inside ndisc_send_redirect)
508                  */
509                 if (inet_peer_xrlim_allow(peer, 1*HZ))
510                         ndisc_send_redirect(skb, target);
511                 if (peer)
512                         inet_putpeer(peer);
513         } else {
514                 int addrtype = ipv6_addr_type(&hdr->saddr);
515
516                 /* This check is security critical. */
517                 if (addrtype == IPV6_ADDR_ANY ||
518                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
519                         goto error;
520                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
521                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
522                                     ICMPV6_NOT_NEIGHBOUR, 0);
523                         goto error;
524                 }
525         }
526
527         mtu = ip6_dst_mtu_forward(dst);
528         if (mtu < IPV6_MIN_MTU)
529                 mtu = IPV6_MIN_MTU;
530
531         if (ip6_pkt_too_big(skb, mtu)) {
532                 /* Again, force the OUTPUT device to be used for source address selection */
533                 skb->dev = dst->dev;
534                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
535                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
536                                 IPSTATS_MIB_INTOOBIGERRORS);
537                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
538                                 IPSTATS_MIB_FRAGFAILS);
539                 kfree_skb(skb);
540                 return -EMSGSIZE;
541         }
542
543         if (skb_cow(skb, dst->dev->hard_header_len)) {
544                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
545                                 IPSTATS_MIB_OUTDISCARDS);
546                 goto drop;
547         }
548
549         hdr = ipv6_hdr(skb);
550
551         /* Decrementing the hop limit is delayed until after the skb COW */
552
553         hdr->hop_limit--;
554
555         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
556         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
557         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
558                        net, NULL, skb, skb->dev, dst->dev,
559                        ip6_forward_finish);
560
561 error:
562         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
563 drop:
564         kfree_skb(skb);
565         return -EINVAL;
566 }
567
568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
569 {
570         to->pkt_type = from->pkt_type;
571         to->priority = from->priority;
572         to->protocol = from->protocol;
573         skb_dst_drop(to);
574         skb_dst_set(to, dst_clone(skb_dst(from)));
575         to->dev = from->dev;
576         to->mark = from->mark;
577
578 #ifdef CONFIG_NET_SCHED
579         to->tc_index = from->tc_index;
580 #endif
581         nf_copy(to, from);
582         skb_copy_secmark(to, from);
583 }
584
585 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
586                  int (*output)(struct net *, struct sock *, struct sk_buff *))
587 {
588         struct sk_buff *frag;
589         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
590         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
591                                 inet6_sk(skb->sk) : NULL;
592         struct ipv6hdr *tmp_hdr;
593         struct frag_hdr *fh;
594         unsigned int mtu, hlen, left, len;
595         int hroom, troom;
596         __be32 frag_id;
597         int ptr, offset = 0, err = 0;
598         u8 *prevhdr, nexthdr = 0;
599
600         err = ip6_find_1stfragopt(skb, &prevhdr);
601         if (err < 0)
602                 goto fail;
603         hlen = err;
604         nexthdr = *prevhdr;
605
606         mtu = ip6_skb_dst_mtu(skb);
607
608         /* We must not fragment if the socket is set to force MTU discovery
609          * or if the skb was not generated by a local socket.
610          */
611         if (unlikely(!skb->ignore_df && skb->len > mtu))
612                 goto fail_toobig;
613
614         if (IP6CB(skb)->frag_max_size) {
615                 if (IP6CB(skb)->frag_max_size > mtu)
616                         goto fail_toobig;
617
618                 /* don't send fragments larger than what we received */
619                 mtu = IP6CB(skb)->frag_max_size;
620                 if (mtu < IPV6_MIN_MTU)
621                         mtu = IPV6_MIN_MTU;
622         }
623
624         if (np && np->frag_size < mtu) {
625                 if (np->frag_size)
626                         mtu = np->frag_size;
627         }
628         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
629                 goto fail_toobig;
630         mtu -= hlen + sizeof(struct frag_hdr);
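	/*
	 * Worked example (illustrative numbers): with a 1500 byte MTU and
	 * a bare 40 byte IPv6 header (hlen == 40), the per-fragment
	 * payload budget becomes 1500 - 40 - 8 = 1452 bytes; the slow
	 * path below further rounds non-final fragments down to a
	 * multiple of eight, i.e. 1448 payload bytes per fragment.
	 */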
631
632         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
633                                     &ipv6_hdr(skb)->saddr);
634
635         if (skb->ip_summed == CHECKSUM_PARTIAL &&
636             (err = skb_checksum_help(skb)))
637                 goto fail;
638
639         hroom = LL_RESERVED_SPACE(rt->dst.dev);
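	/*
	 * Fast path: if the skb already carries a frag list whose chunks
	 * line up with the MTU (8-byte aligned, enough headroom, nothing
	 * cloned or shared), each existing chunk is turned into one
	 * fragment in place. Otherwise fall through to the slow path,
	 * which copies the payload into freshly allocated fragments.
	 */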
640         if (skb_has_frag_list(skb)) {
641                 unsigned int first_len = skb_pagelen(skb);
642                 struct sk_buff *frag2;
643
644                 if (first_len - hlen > mtu ||
645                     ((first_len - hlen) & 7) ||
646                     skb_cloned(skb) ||
647                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
648                         goto slow_path;
649
650                 skb_walk_frags(skb, frag) {
651                         /* Check fragment geometry. */
652                         if (frag->len > mtu ||
653                             ((frag->len & 7) && frag->next) ||
654                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
655                                 goto slow_path_clean;
656
657                         /* Partially cloned skb? */
658                         if (skb_shared(frag))
659                                 goto slow_path_clean;
660
661                         BUG_ON(frag->sk);
662                         if (skb->sk) {
663                                 frag->sk = skb->sk;
664                                 frag->destructor = sock_wfree;
665                         }
666                         skb->truesize -= frag->truesize;
667                 }
668
669                 err = 0;
670                 offset = 0;
671                 /* BUILD HEADER */
672
673                 *prevhdr = NEXTHDR_FRAGMENT;
674                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
675                 if (!tmp_hdr) {
676                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
677                                       IPSTATS_MIB_FRAGFAILS);
678                         err = -ENOMEM;
679                         goto fail;
680                 }
681                 frag = skb_shinfo(skb)->frag_list;
682                 skb_frag_list_init(skb);
683
684                 __skb_pull(skb, hlen);
685                 fh = __skb_push(skb, sizeof(struct frag_hdr));
686                 __skb_push(skb, hlen);
687                 skb_reset_network_header(skb);
688                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
689
690                 fh->nexthdr = nexthdr;
691                 fh->reserved = 0;
692                 fh->frag_off = htons(IP6_MF);
693                 fh->identification = frag_id;
694
695                 first_len = skb_pagelen(skb);
696                 skb->data_len = first_len - skb_headlen(skb);
697                 skb->len = first_len;
698                 ipv6_hdr(skb)->payload_len = htons(first_len -
699                                                    sizeof(struct ipv6hdr));
700
701                 for (;;) {
702                         /* Prepare the header of the next fragment
703                          * before the previous one goes down. */
704                         if (frag) {
705                                 frag->ip_summed = CHECKSUM_NONE;
706                                 skb_reset_transport_header(frag);
707                                 fh = __skb_push(frag, sizeof(struct frag_hdr));
708                                 __skb_push(frag, hlen);
709                                 skb_reset_network_header(frag);
710                                 memcpy(skb_network_header(frag), tmp_hdr,
711                                        hlen);
712                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
713                                 fh->nexthdr = nexthdr;
714                                 fh->reserved = 0;
715                                 fh->frag_off = htons(offset);
716                                 if (frag->next)
717                                         fh->frag_off |= htons(IP6_MF);
718                                 fh->identification = frag_id;
719                                 ipv6_hdr(frag)->payload_len =
720                                                 htons(frag->len -
721                                                       sizeof(struct ipv6hdr));
722                                 ip6_copy_metadata(frag, skb);
723                         }
724
725                         err = output(net, sk, skb);
726                         if (!err)
727                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
728                                               IPSTATS_MIB_FRAGCREATES);
729
730                         if (err || !frag)
731                                 break;
732
733                         skb = frag;
734                         frag = skb->next;
735                         skb->next = NULL;
736                 }
737
738                 kfree(tmp_hdr);
739
740                 if (err == 0) {
741                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
742                                       IPSTATS_MIB_FRAGOKS);
743                         return 0;
744                 }
745
746                 kfree_skb_list(frag);
747
748                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749                               IPSTATS_MIB_FRAGFAILS);
750                 return err;
751
752 slow_path_clean:
753                 skb_walk_frags(skb, frag2) {
754                         if (frag2 == frag)
755                                 break;
756                         frag2->sk = NULL;
757                         frag2->destructor = NULL;
758                         skb->truesize += frag2->truesize;
759                 }
760         }
761
762 slow_path:
763         left = skb->len - hlen;         /* Data left to send */
764         ptr = hlen;                     /* Where to start from */
765
766         /*
767          *      Fragment the datagram.
768          */
769
770         troom = rt->dst.dev->needed_tailroom;
771
772         /*
773          *      Keep copying data until we run out.
774          */
775         while (left > 0)        {
776                 u8 *fragnexthdr_offset;
777
778                 len = left;
779                 /* IF: it doesn't fit, use 'mtu' - the per-fragment data space */
780                 if (len > mtu)
781                         len = mtu;
782                 /* IF: we are not sending up to and including the packet end
783                    then align the next start on an eight byte boundary */
784                 if (len < left) {
785                         len &= ~7;
786                 }
787
788                 /* Allocate buffer */
789                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
790                                  hroom + troom, GFP_ATOMIC);
791                 if (!frag) {
792                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
793                                       IPSTATS_MIB_FRAGFAILS);
794                         err = -ENOMEM;
795                         goto fail;
796                 }
797
798                 /*
799                  *      Set up data on packet
800                  */
801
802                 ip6_copy_metadata(frag, skb);
803                 skb_reserve(frag, hroom);
804                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
805                 skb_reset_network_header(frag);
806                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
807                 frag->transport_header = (frag->network_header + hlen +
808                                           sizeof(struct frag_hdr));
809
810                 /*
811                  *      Charge the memory for the fragment to any owner
812                  *      it might possess
813                  */
814                 if (skb->sk)
815                         skb_set_owner_w(frag, skb->sk);
816
817                 /*
818                  *      Copy the packet header into the new buffer.
819                  */
820                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
821
822                 fragnexthdr_offset = skb_network_header(frag);
823                 fragnexthdr_offset += prevhdr - skb_network_header(skb);
824                 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
825
826                 /*
827                  *      Build fragment header.
828                  */
829                 fh->nexthdr = nexthdr;
830                 fh->reserved = 0;
831                 fh->identification = frag_id;
832
833                 /*
834                  *      Copy a block of the IP datagram.
835                  */
836                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
837                                      len));
838                 left -= len;
839
840                 fh->frag_off = htons(offset);
841                 if (left > 0)
842                         fh->frag_off |= htons(IP6_MF);
843                 ipv6_hdr(frag)->payload_len = htons(frag->len -
844                                                     sizeof(struct ipv6hdr));
845
846                 ptr += len;
847                 offset += len;
848
849                 /*
850                  *      Put this fragment into the sending queue.
851                  */
852                 err = output(net, sk, frag);
853                 if (err)
854                         goto fail;
855
856                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
857                               IPSTATS_MIB_FRAGCREATES);
858         }
859         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
860                       IPSTATS_MIB_FRAGOKS);
861         consume_skb(skb);
862         return err;
863
864 fail_toobig:
865         if (skb->sk && dst_allfrag(skb_dst(skb)))
866                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
867
868         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
869         err = -EMSGSIZE;
870
871 fail:
872         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873                       IPSTATS_MIB_FRAGFAILS);
874         kfree_skb(skb);
875         return err;
876 }
877
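/*
 * Returns nonzero when the cached route can no longer be tied to the
 * flow's address: the route key is not a matching /128 host entry and
 * the per-socket address cache does not match either. A nonzero
 * result means "release the cached dst and do a fresh lookup".
 */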
878 static inline int ip6_rt_check(const struct rt6key *rt_key,
879                                const struct in6_addr *fl_addr,
880                                const struct in6_addr *addr_cache)
881 {
882         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
883                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
884 }
885
886 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
887                                           struct dst_entry *dst,
888                                           const struct flowi6 *fl6)
889 {
890         struct ipv6_pinfo *np = inet6_sk(sk);
891         struct rt6_info *rt;
892
893         if (!dst)
894                 goto out;
895
896         if (dst->ops->family != AF_INET6) {
897                 dst_release(dst);
898                 return NULL;
899         }
900
901         rt = (struct rt6_info *)dst;
902         /* Yes, checking route validity in the not-connected
903          * case is not very simple. Take into account
904          * that we do not support routing by source, TOS,
905          * or MSG_DONTROUTE            --ANK (980726)
906          *
907          * 1. ip6_rt_check(): If the route was a host route,
908          *    check that the cached destination is current.
909          *    If it is a network route, we can still
910          *    check its validity using a saved pointer
911          *    to the last used address: daddr_cache.
912          *    We do not want to save the whole address now
913          *    (because the main consumer of this service
914          *    is TCP, which does not have this problem),
915          *    so this last trick works only on connected
916          *    sockets.
917          * 2. The oif should also be the same.
918          */
919         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
920 #ifdef CONFIG_IPV6_SUBTREES
921             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
922 #endif
923            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
924               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
925                 dst_release(dst);
926                 dst = NULL;
927         }
928
929 out:
930         return dst;
931 }
932
933 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
934                                struct dst_entry **dst, struct flowi6 *fl6)
935 {
936 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
937         struct neighbour *n;
938         struct rt6_info *rt;
939 #endif
940         int err;
941         int flags = 0;
942
943         /* The correct way to handle this would be to do
944          * ip6_route_get_saddr, and then ip6_route_output; however,
945          * the route-specific preferred source forces the
946          * ip6_route_output call _before_ ip6_route_get_saddr.
947          *
948          * In source specific routing (no src=any default route),
949          * ip6_route_output will fail given src=any saddr, though, so
950          * that's why we try it again later.
951          */
952         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
953                 struct rt6_info *rt;
954                 bool had_dst = *dst != NULL;
955
956                 if (!had_dst)
957                         *dst = ip6_route_output(net, sk, fl6);
958                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
959                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
960                                           sk ? inet6_sk(sk)->srcprefs : 0,
961                                           &fl6->saddr);
962                 if (err)
963                         goto out_err_release;
964
965                 /* If we had an erroneous initial result, pretend it
966                  * never existed and let the SA-enabled version take
967                  * over.
968                  */
969                 if (!had_dst && (*dst)->error) {
970                         dst_release(*dst);
971                         *dst = NULL;
972                 }
973
974                 if (fl6->flowi6_oif)
975                         flags |= RT6_LOOKUP_F_IFACE;
976         }
977
978         if (!*dst)
979                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
980
981         err = (*dst)->error;
982         if (err)
983                 goto out_err_release;
984
985 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
986         /*
987          * If the dst entry we've looked up has a
988          * neighbour entry that is in the INCOMPLETE
989          * state and the src address from the flow is
990          * marked as OPTIMISTIC, we release the found
991          * dst entry and replace it with the dst entry
992          * of the nexthop router.
993          */
994         rt = (struct rt6_info *) *dst;
995         rcu_read_lock_bh();
996         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
997                                       rt6_nexthop(rt, &fl6->daddr));
998         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
999         rcu_read_unlock_bh();
1000
1001         if (err) {
1002                 struct inet6_ifaddr *ifp;
1003                 struct flowi6 fl_gw6;
1004                 int redirect;
1005
1006                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1007                                       (*dst)->dev, 1);
1008
1009                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1010                 if (ifp)
1011                         in6_ifa_put(ifp);
1012
1013                 if (redirect) {
1014                         /*
1015                          * We need to get the dst entry for the
1016                          * default router instead
1017                          */
1018                         dst_release(*dst);
1019                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1020                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1021                         *dst = ip6_route_output(net, sk, &fl_gw6);
1022                         err = (*dst)->error;
1023                         if (err)
1024                                 goto out_err_release;
1025                 }
1026         }
1027 #endif
1028         if (ipv6_addr_v4mapped(&fl6->saddr) &&
1029             !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1030                 err = -EAFNOSUPPORT;
1031                 goto out_err_release;
1032         }
1033
1034         return 0;
1035
1036 out_err_release:
1037         dst_release(*dst);
1038         *dst = NULL;
1039
1040         if (err == -ENETUNREACH)
1041                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1042         return err;
1043 }
1044
1045 /**
1046  *      ip6_dst_lookup - perform route lookup on flow
1047  *      @net: network namespace
1048  *      @sk: socket which provides route info
1049  *      @dst: pointer to dst_entry * for result
1050  *      @fl6: flow to lookup
1051  *      This function performs a route lookup on the given flow.
1052  *
1053  *      It returns zero on success, or a standard errno code on error.
1054  */
1055 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1056                    struct flowi6 *fl6)
1057 {
1058         *dst = NULL;
1059         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1060 }
1061 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1062
1063 /**
1064  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1065  *      @sk: socket which provides route info
1066  *      @fl6: flow to lookup
1067  *      @final_dst: final destination address for ipsec lookup
1068  *
1069  *      This function performs a route lookup on the given flow.
1070  *
1071  *      It returns a valid dst pointer on success, or a pointer encoded
1072  *      error code.
1073  */
1074 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1075                                       const struct in6_addr *final_dst)
1076 {
1077         struct dst_entry *dst = NULL;
1078         int err;
1079
1080         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1081         if (err)
1082                 return ERR_PTR(err);
1083         if (final_dst)
1084                 fl6->daddr = *final_dst;
1085
1086         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1087 }
1088 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
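
/*
 * A minimal sketch of the intended calling pattern for
 * ip6_dst_lookup_flow(): the return value is either a valid dst or an
 * ERR_PTR()-encoded errno, so it must be checked with IS_ERR() before
 * use. The helper name and the flow fields chosen here are
 * hypothetical.
 */
static int example_route_and_send(struct sock *sk, struct sk_buff *skb,
				  const struct in6_addr *daddr)
{
	struct flowi6 fl6;
	struct dst_entry *dst;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = sk->sk_protocol;
	fl6.daddr = *daddr;
	fl6.flowi6_oif = sk->sk_bound_dev_if;

	dst = ip6_dst_lookup_flow(sk, &fl6, NULL);
	if (IS_ERR(dst)) {
		kfree_skb(skb);
		return PTR_ERR(dst);
	}

	skb_dst_set(skb, dst);	/* consumes the dst reference */
	return ip6_xmit(sk, skb, &fl6, sk->sk_mark, NULL, 0);
}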
1089
1090 /**
1091  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1092  *      @sk: socket which provides the dst cache and route info
1093  *      @fl6: flow to lookup
1094  *      @final_dst: final destination address for ipsec lookup
1095  *
1096  *      This function performs a route lookup on the given flow with the
1097  *      possibility of using the cached route in the socket if it is valid.
1098  *      It will take the socket dst lock when operating on the dst cache.
1099  *      As a result, this function can only be used in process context.
1100  *
1101  *      It returns a valid dst pointer on success, or a pointer encoded
1102  *      error code.
1103  */
1104 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1105                                          const struct in6_addr *final_dst)
1106 {
1107         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1108
1109         dst = ip6_sk_dst_check(sk, dst, fl6);
1110         if (!dst)
1111                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1112
1113         return dst;
1114 }
1115 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1116
1117 static inline int ip6_ufo_append_data(struct sock *sk,
1118                         struct sk_buff_head *queue,
1119                         int getfrag(void *from, char *to, int offset, int len,
1120                         int odd, struct sk_buff *skb),
1121                         void *from, int length, int hh_len, int fragheaderlen,
1122                         int exthdrlen, int transhdrlen, int mtu,
1123                         unsigned int flags, const struct flowi6 *fl6)
1124
1125 {
1126         struct sk_buff *skb;
1127         int err;
1128
1129         /* The network device supports UDP large send offload, so
1130          * create one single skb packet containing the complete
1131          * UDP datagram.
1132          */
1133         skb = skb_peek_tail(queue);
1134         if (!skb) {
1135                 skb = sock_alloc_send_skb(sk,
1136                         hh_len + fragheaderlen + transhdrlen + 20,
1137                         (flags & MSG_DONTWAIT), &err);
1138                 if (!skb)
1139                         return err;
1140
1141                 /* reserve space for the hardware header */
1142                 skb_reserve(skb, hh_len);
1143
1144                 /* create space for UDP/IP header */
1145                 skb_put(skb, fragheaderlen + transhdrlen);
1146
1147                 /* initialize network header pointer */
1148                 skb_set_network_header(skb, exthdrlen);
1149
1150                 /* initialize protocol header pointer */
1151                 skb->transport_header = skb->network_header + fragheaderlen;
1152
1153                 skb->protocol = htons(ETH_P_IPV6);
1154                 skb->csum = 0;
1155
1156                 if (flags & MSG_CONFIRM)
1157                         skb_set_dst_pending_confirm(skb, 1);
1158
1159                 __skb_queue_tail(queue, skb);
1160         } else if (skb_is_gso(skb)) {
1161                 goto append;
1162         }
1163
1164         skb->ip_summed = CHECKSUM_PARTIAL;
1165         /* Specify the length of each IPv6 datagram fragment.
1166          * It has to be a multiple of 8.
1167          */
1168         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1169                                      sizeof(struct frag_hdr)) & ~7;
1170         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1171         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1172                                                          &fl6->daddr,
1173                                                          &fl6->saddr);
1174
1175 append:
1176         return skb_append_datato_frags(sk, skb, getfrag, from,
1177                                        (length - transhdrlen));
1178 }
1179
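/*
 * Both dup helpers below rely on the extension header length encoding
 * of RFC 2460: hdrlen counts 8-octet units beyond the first 8 octets,
 * so the header's total size in bytes is (hdrlen + 1) * 8.
 */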
1180 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1181                                                gfp_t gfp)
1182 {
1183         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1184 }
1185
1186 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1187                                                 gfp_t gfp)
1188 {
1189         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1190 }
1191
1192 static void ip6_append_data_mtu(unsigned int *mtu,
1193                                 int *maxfraglen,
1194                                 unsigned int fragheaderlen,
1195                                 struct sk_buff *skb,
1196                                 struct rt6_info *rt,
1197                                 unsigned int orig_mtu)
1198 {
1199         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1200                 if (!skb) {
1201                         /* first fragment, reserve header_len */
1202                         *mtu = orig_mtu - rt->dst.header_len;
1203
1204                 } else {
1205                         /*
1206                          * this fragment is not the first, so the header
1207                          * space is regarded as data space.
1208                          */
1209                         *mtu = orig_mtu;
1210                 }
1211                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1212                               + fragheaderlen - sizeof(struct frag_hdr);
1213         }
1214 }
1215
1216 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1217                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1218                           struct rt6_info *rt, struct flowi6 *fl6)
1219 {
1220         struct ipv6_pinfo *np = inet6_sk(sk);
1221         unsigned int mtu;
1222         struct ipv6_txoptions *opt = ipc6->opt;
1223
1224         /*
1225          * setup for corking
1226          */
1227         if (opt) {
1228                 if (WARN_ON(v6_cork->opt))
1229                         return -EINVAL;
1230
1231                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1232                 if (unlikely(!v6_cork->opt))
1233                         return -ENOBUFS;
1234
1235                 v6_cork->opt->tot_len = opt->tot_len;
1236                 v6_cork->opt->opt_flen = opt->opt_flen;
1237                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1238
1239                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1240                                                     sk->sk_allocation);
1241                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1242                         return -ENOBUFS;
1243
1244                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1245                                                     sk->sk_allocation);
1246                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1247                         return -ENOBUFS;
1248
1249                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1250                                                    sk->sk_allocation);
1251                 if (opt->hopopt && !v6_cork->opt->hopopt)
1252                         return -ENOBUFS;
1253
1254                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1255                                                     sk->sk_allocation);
1256                 if (opt->srcrt && !v6_cork->opt->srcrt)
1257                         return -ENOBUFS;
1258
1259                 /* need source address above --miyazawa */
1260         }
1261         dst_hold(&rt->dst);
1262         cork->base.dst = &rt->dst;
1263         cork->fl.u.ip6 = *fl6;
1264         v6_cork->hop_limit = ipc6->hlimit;
1265         v6_cork->tclass = ipc6->tclass;
1266         if (rt->dst.flags & DST_XFRM_TUNNEL)
1267                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1268                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1269         else
1270                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1271                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1272         if (np->frag_size < mtu) {
1273                 if (np->frag_size)
1274                         mtu = np->frag_size;
1275         }
1276         cork->base.fragsize = mtu;
1277         if (dst_allfrag(rt->dst.path))
1278                 cork->base.flags |= IPCORK_ALLFRAG;
1279         cork->base.length = 0;
1280
1281         return 0;
1282 }
1283
1284 static int __ip6_append_data(struct sock *sk,
1285                              struct flowi6 *fl6,
1286                              struct sk_buff_head *queue,
1287                              struct inet_cork *cork,
1288                              struct inet6_cork *v6_cork,
1289                              struct page_frag *pfrag,
1290                              int getfrag(void *from, char *to, int offset,
1291                                          int len, int odd, struct sk_buff *skb),
1292                              void *from, int length, int transhdrlen,
1293                              unsigned int flags, struct ipcm6_cookie *ipc6,
1294                              const struct sockcm_cookie *sockc)
1295 {
1296         struct sk_buff *skb, *skb_prev = NULL;
1297         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1298         int exthdrlen = 0;
1299         int dst_exthdrlen = 0;
1300         int hh_len;
1301         int copy;
1302         int err;
1303         int offset = 0;
1304         __u8 tx_flags = 0;
1305         u32 tskey = 0;
1306         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1307         struct ipv6_txoptions *opt = v6_cork->opt;
1308         int csummode = CHECKSUM_NONE;
1309         unsigned int maxnonfragsize, headersize;
1310
1311         skb = skb_peek_tail(queue);
1312         if (!skb) {
1313                 exthdrlen = opt ? opt->opt_flen : 0;
1314                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1315         }
1316
1317         mtu = cork->fragsize;
1318         orig_mtu = mtu;
1319
1320         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1321
1322         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1323                         (opt ? opt->opt_nflen : 0);
1324         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1325                      sizeof(struct frag_hdr);
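	/*
	 * Worked example (illustrative numbers): with mtu == 1500 and a
	 * bare 40 byte IPv6 header (fragheaderlen == 40), maxfraglen is
	 * ((1500 - 40) & ~7) + 40 - 8 == 1488: 40 bytes of IPv6 header,
	 * an 8 byte fragment header, and 1440 payload bytes (a multiple
	 * of 8) per non-final fragment.
	 */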
1326
1327         headersize = sizeof(struct ipv6hdr) +
1328                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1329                      (dst_allfrag(&rt->dst) ?
1330                       sizeof(struct frag_hdr) : 0) +
1331                      rt->rt6i_nfheader_len;
1332
1333         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1334             (sk->sk_protocol == IPPROTO_UDP ||
1335              sk->sk_protocol == IPPROTO_RAW)) {
1336                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1337                                 sizeof(struct ipv6hdr));
1338                 goto emsgsize;
1339         }
1340
1341         if (ip6_sk_ignore_df(sk))
1342                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1343         else
1344                 maxnonfragsize = mtu;
1345
1346         if (cork->length + length > maxnonfragsize - headersize) {
1347 emsgsize:
1348                 ipv6_local_error(sk, EMSGSIZE, fl6,
1349                                  mtu - headersize +
1350                                  sizeof(struct ipv6hdr));
1351                 return -EMSGSIZE;
1352         }
1353
1354         /* CHECKSUM_PARTIAL only with no extension headers and when
1355          * we are not going to fragment
1356          */
1357         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1358             headersize == sizeof(struct ipv6hdr) &&
1359             length <= mtu - headersize &&
1360             !(flags & MSG_MORE) &&
1361             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1362                 csummode = CHECKSUM_PARTIAL;
1363
1364         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1365                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1366                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1367                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1368                         tskey = sk->sk_tskey++;
1369         }
1370
1371         /*
1372          * Let's try using as much space as possible.
1373          * Use MTU if total length of the message fits into the MTU.
1374          * Otherwise, we need to reserve fragment header and
1375          * fragment alignment (= 8-15 octets, in total).
1376          *
1377          * Note that we may need to "move" the data from the tail
1378          * of the buffer to the new fragment when we split
1379          * the message.
1380          *
1381          * FIXME: It may be fragmented into multiple chunks
1382          *        at once if non-fragmentable extension headers
1383          *        are too large.
1384          * --yoshfuji
1385          */
1386
1387         cork->length += length;
1388         if ((((length + (skb ? skb->len : headersize)) > mtu) ||
1389              (skb && skb_is_gso(skb))) &&
1390             (sk->sk_protocol == IPPROTO_UDP) &&
1391             (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1392             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1393                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1394                                           hh_len, fragheaderlen, exthdrlen,
1395                                           transhdrlen, mtu, flags, fl6);
1396                 if (err)
1397                         goto error;
1398                 return 0;
1399         }
1400
1401         if (!skb)
1402                 goto alloc_new_skb;
1403
1404         while (length > 0) {
1405                 /* Check if the remaining data fits into current packet. */
1406                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1407                 if (copy < length)
1408                         copy = maxfraglen - skb->len;
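		/*
		 * Illustrative reading of the two assignments above (not
		 * in the original): while everything may still fit in one
		 * packet (and IPCORK_ALLFRAG is off), the current skb may
		 * grow to the full mtu; once more data remains than fits,
		 * every fragment except the last must stop at maxfraglen,
		 * the largest 8-byte-aligned length the fragment header
		 * format allows.
		 */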
1409
1410                 if (copy <= 0) {
1411                         char *data;
1412                         unsigned int datalen;
1413                         unsigned int fraglen;
1414                         unsigned int fraggap;
1415                         unsigned int alloclen;
1416 alloc_new_skb:
1417                         /* There's no room in the current skb */
1418                         if (skb)
1419                                 fraggap = skb->len - maxfraglen;
1420                         else
1421                                 fraggap = 0;
1422                         /* update mtu and maxfraglen if necessary */
1423                         if (!skb || !skb_prev)
1424                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1425                                                     fragheaderlen, skb, rt,
1426                                                     orig_mtu);
1427
1428                         skb_prev = skb;
1429
1430                         /*
1431                          * If remaining data exceeds the mtu,
1432                          * we know we need more fragment(s).
1433                          */
1434                         datalen = length + fraggap;
1435
1436                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1437                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1438                         if ((flags & MSG_MORE) &&
1439                             !(rt->dst.dev->features&NETIF_F_SG))
1440                                 alloclen = mtu;
1441                         else
1442                                 alloclen = datalen + fragheaderlen;
1443
1444                         alloclen += dst_exthdrlen;
1445
1446                         if (datalen != length + fraggap) {
1447                                 /*
1448                                  * This is not the last fragment; the trailer
1449                                  * space is regarded as data space.
1450                                  */
1451                                 datalen += rt->dst.trailer_len;
1452                         }
1453
1454                         alloclen += rt->dst.trailer_len;
1455                         fraglen = datalen + fragheaderlen;
1456
1457                         /*
1458                          * We just reserve space for the fragment header.
1459                          * Note: this may be an overallocation if the
1460                          * message (without MSG_MORE) fits into the MTU.
1461                          */
1462                         alloclen += sizeof(struct frag_hdr);
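			/*
			 * Worked example (illustrative, assuming mtu 1500
			 * and a bare 40-byte IPv6 header): maxfraglen is
			 * the 8-byte-aligned 1488 (frag_hdr already
			 * subtracted), a middle fragment's datalen is
			 * clamped to 1488 - 40 = 1448, and alloclen ends
			 * up at 1448 + 40 + sizeof(struct frag_hdr) =
			 * 1496 bytes, plus dst_exthdrlen and trailer_len.
			 */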
1463
1464                         copy = datalen - transhdrlen - fraggap;
1465                         if (copy < 0) {
1466                                 err = -EINVAL;
1467                                 goto error;
1468                         }
1469                         if (transhdrlen) {
1470                                 skb = sock_alloc_send_skb(sk,
1471                                                 alloclen + hh_len,
1472                                                 (flags & MSG_DONTWAIT), &err);
1473                         } else {
1474                                 skb = NULL;
1475                                 if (refcount_read(&sk->sk_wmem_alloc) <=
1476                                     2 * sk->sk_sndbuf)
1477                                         skb = sock_wmalloc(sk,
1478                                                            alloclen + hh_len, 1,
1479                                                            sk->sk_allocation);
1480                                 if (unlikely(!skb))
1481                                         err = -ENOBUFS;
1482                         }
1483                         if (!skb)
1484                                 goto error;
1485                         /*
1486                          *      Fill in the control structures
1487                          */
1488                         skb->protocol = htons(ETH_P_IPV6);
1489                         skb->ip_summed = csummode;
1490                         skb->csum = 0;
1491                         /* reserve room for the fragment header and IPsec headers */
1492                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1493                                     dst_exthdrlen);
1494
1495                         /* Only the initial fragment is time stamped */
1496                         skb_shinfo(skb)->tx_flags = tx_flags;
1497                         tx_flags = 0;
1498                         skb_shinfo(skb)->tskey = tskey;
1499                         tskey = 0;
1500
1501                         /*
1502                          *      Find where to start putting bytes
1503                          */
1504                         data = skb_put(skb, fraglen);
1505                         skb_set_network_header(skb, exthdrlen);
1506                         data += fragheaderlen;
1507                         skb->transport_header = (skb->network_header +
1508                                                  fragheaderlen);
1509                         if (fraggap) {
1510                                 skb->csum = skb_copy_and_csum_bits(
1511                                         skb_prev, maxfraglen,
1512                                         data + transhdrlen, fraggap, 0);
1513                                 skb_prev->csum = csum_sub(skb_prev->csum,
1514                                                           skb->csum);
1515                                 data += fraggap;
1516                                 pskb_trim_unique(skb_prev, maxfraglen);
1517                         }
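			/*
			 * Illustrative note: fraggap is the overhang left
			 * when the previous skb was allowed to grow past
			 * maxfraglen (the full-mtu case).  The block above
			 * moves those bytes to the front of the new
			 * fragment, trims skb_prev back to maxfraglen and
			 * uses csum_sub() to keep its running checksum
			 * consistent with the shortened data.
			 */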
1518                         if (copy > 0 &&
1519                             getfrag(from, data + transhdrlen, offset,
1520                                     copy, fraggap, skb) < 0) {
1521                                 err = -EFAULT;
1522                                 kfree_skb(skb);
1523                                 goto error;
1524                         }
1525
1526                         offset += copy;
1527                         length -= datalen - fraggap;
1528                         transhdrlen = 0;
1529                         exthdrlen = 0;
1530                         dst_exthdrlen = 0;
1531
1532                         if ((flags & MSG_CONFIRM) && !skb_prev)
1533                                 skb_set_dst_pending_confirm(skb, 1);
1534
1535                         /*
1536                          * Put the packet on the pending queue
1537                          */
1538                         __skb_queue_tail(queue, skb);
1539                         continue;
1540                 }
1541
1542                 if (copy > length)
1543                         copy = length;
1544
1545                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1546                         unsigned int off;
1547
1548                         off = skb->len;
1549                         if (getfrag(from, skb_put(skb, copy),
1550                                                 offset, copy, off, skb) < 0) {
1551                                 __skb_trim(skb, off);
1552                                 err = -EFAULT;
1553                                 goto error;
1554                         }
1555                 } else {
1556                         int i = skb_shinfo(skb)->nr_frags;
1557
1558                         err = -ENOMEM;
1559                         if (!sk_page_frag_refill(sk, pfrag))
1560                                 goto error;
1561
1562                         if (!skb_can_coalesce(skb, i, pfrag->page,
1563                                               pfrag->offset)) {
1564                                 err = -EMSGSIZE;
1565                                 if (i == MAX_SKB_FRAGS)
1566                                         goto error;
1567
1568                                 __skb_fill_page_desc(skb, i, pfrag->page,
1569                                                      pfrag->offset, 0);
1570                                 skb_shinfo(skb)->nr_frags = ++i;
1571                                 get_page(pfrag->page);
1572                         }
1573                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1574                         if (getfrag(from,
1575                                     page_address(pfrag->page) + pfrag->offset,
1576                                     offset, copy, skb->len, skb) < 0)
1577                                 goto error_efault;
1578
1579                         pfrag->offset += copy;
1580                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1581                         skb->len += copy;
1582                         skb->data_len += copy;
1583                         skb->truesize += copy;
1584                         refcount_add(copy, &sk->sk_wmem_alloc);
1585                 }
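		/*
		 * Illustrative note: on a scatter-gather device the bytes
		 * land in the per-socket page frag instead of the skb
		 * head, so the branch above only grows the frag
		 * descriptor and the len/data_len/truesize accounting;
		 * skb_can_coalesce() lets back-to-back appends share a
		 * single page fragment.
		 */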
1586                 offset += copy;
1587                 length -= copy;
1588         }
1589
1590         return 0;
1591
1592 error_efault:
1593         err = -EFAULT;
1594 error:
1595         cork->length -= length;
1596         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1597         return err;
1598 }
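
/*
 * Illustrative sketch, not part of the original file: the shape of a
 * getfrag() callback as __ip6_append_data() expects it, modelled on
 * ip_generic_getfrag().  The function name and the kernel-buffer
 * source are hypothetical; "odd" is the byte offset of "to" within the
 * skb payload so the folded checksum can be combined correctly.
 */
#if 0
static int example_getfrag(void *from, char *to, int offset, int len,
			   int odd, struct sk_buff *skb)
{
	const char *kbuf = from;	/* hypothetical kernel buffer */

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* hardware will checksum; a plain copy is enough */
		memcpy(to, kbuf + offset, len);
	} else {
		/* copy and fold the bytes into the software checksum */
		__wsum csum = csum_partial_copy_nocheck(kbuf + offset,
							to, len, 0);

		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
#endif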
1599
1600 int ip6_append_data(struct sock *sk,
1601                     int getfrag(void *from, char *to, int offset, int len,
1602                                 int odd, struct sk_buff *skb),
1603                     void *from, int length, int transhdrlen,
1604                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1605                     struct rt6_info *rt, unsigned int flags,
1606                     const struct sockcm_cookie *sockc)
1607 {
1608         struct inet_sock *inet = inet_sk(sk);
1609         struct ipv6_pinfo *np = inet6_sk(sk);
1610         int exthdrlen;
1611         int err;
1612
1613         if (flags&MSG_PROBE)
1614                 return 0;
1615         if (skb_queue_empty(&sk->sk_write_queue)) {
1616                 /*
1617                  * set up for corking
1618                  */
1619                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1620                                      ipc6, rt, fl6);
1621                 if (err)
1622                         return err;
1623
1624                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1625                 length += exthdrlen;
1626                 transhdrlen += exthdrlen;
1627         } else {
1628                 fl6 = &inet->cork.fl.u.ip6;
1629                 transhdrlen = 0;
1630         }
1631
1632         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1633                                  &np->cork, sk_page_frag(sk), getfrag,
1634                                  from, length, transhdrlen, flags, ipc6, sockc);
1635 }
1636 EXPORT_SYMBOL_GPL(ip6_append_data);
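
/*
 * Illustrative usage sketch (not in the original): the corked-send
 * pattern used by datagram protocols, in the spirit of
 * udpv6_sendmsg().  All variables are assumed to have been set up by
 * the caller; ip_generic_getfrag() copies from the msghdr iterator.
 */
#if 0
	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
			      transhdrlen, &ipc6, &fl6, rt,
			      msg->msg_flags, &sockc);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
#endif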
1637
1638 static void ip6_cork_release(struct inet_cork_full *cork,
1639                              struct inet6_cork *v6_cork)
1640 {
1641         if (v6_cork->opt) {
1642                 kfree(v6_cork->opt->dst0opt);
1643                 kfree(v6_cork->opt->dst1opt);
1644                 kfree(v6_cork->opt->hopopt);
1645                 kfree(v6_cork->opt->srcrt);
1646                 kfree(v6_cork->opt);
1647                 v6_cork->opt = NULL;
1648         }
1649
1650         if (cork->base.dst) {
1651                 dst_release(cork->base.dst);
1652                 cork->base.dst = NULL;
1653                 cork->base.flags &= ~IPCORK_ALLFRAG;
1654         }
1655         memset(&cork->fl, 0, sizeof(cork->fl));
1656 }
1657
1658 struct sk_buff *__ip6_make_skb(struct sock *sk,
1659                                struct sk_buff_head *queue,
1660                                struct inet_cork_full *cork,
1661                                struct inet6_cork *v6_cork)
1662 {
1663         struct sk_buff *skb, *tmp_skb;
1664         struct sk_buff **tail_skb;
1665         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1666         struct ipv6_pinfo *np = inet6_sk(sk);
1667         struct net *net = sock_net(sk);
1668         struct ipv6hdr *hdr;
1669         struct ipv6_txoptions *opt = v6_cork->opt;
1670         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1671         struct flowi6 *fl6 = &cork->fl.u.ip6;
1672         unsigned char proto = fl6->flowi6_proto;
1673
1674         skb = __skb_dequeue(queue);
1675         if (!skb)
1676                 goto out;
1677         tail_skb = &(skb_shinfo(skb)->frag_list);
1678
1679         /* move skb->data back from the extension headers to the IP header */
1680         if (skb->data < skb_network_header(skb))
1681                 __skb_pull(skb, skb_network_offset(skb));
1682         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1683                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1684                 *tail_skb = tmp_skb;
1685                 tail_skb = &(tmp_skb->next);
1686                 skb->len += tmp_skb->len;
1687                 skb->data_len += tmp_skb->len;
1688                 skb->truesize += tmp_skb->truesize;
1689                 tmp_skb->destructor = NULL;
1690                 tmp_skb->sk = NULL;
1691         }
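	/*
	 * Illustrative note: the loop above chains every queued skb onto
	 * the first one's frag_list, pulling their per-fragment network
	 * headers first, so ip6_fragment() can later emit them as
	 * individual fragments without copying the payload again.
	 */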
1692
1693         /* Allow local fragmentation. */
1694         skb->ignore_df = ip6_sk_ignore_df(sk);
1695
1696         *final_dst = fl6->daddr;
1697         __skb_pull(skb, skb_network_header_len(skb));
1698         if (opt && opt->opt_flen)
1699                 ipv6_push_frag_opts(skb, opt, &proto);
1700         if (opt && opt->opt_nflen)
1701                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1702
1703         skb_push(skb, sizeof(struct ipv6hdr));
1704         skb_reset_network_header(skb);
1705         hdr = ipv6_hdr(skb);
1706
1707         ip6_flow_hdr(hdr, v6_cork->tclass,
1708                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1709                                         np->autoflowlabel, fl6));
1710         hdr->hop_limit = v6_cork->hop_limit;
1711         hdr->nexthdr = proto;
1712         hdr->saddr = fl6->saddr;
1713         hdr->daddr = *final_dst;
1714
1715         skb->priority = sk->sk_priority;
1716         skb->mark = sk->sk_mark;
1717
1718         skb_dst_set(skb, dst_clone(&rt->dst));
1719         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1720         if (proto == IPPROTO_ICMPV6) {
1721                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1722
1723                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1724                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1725         }
1726
1727         ip6_cork_release(cork, v6_cork);
1728 out:
1729         return skb;
1730 }
1731
1732 int ip6_send_skb(struct sk_buff *skb)
1733 {
1734         struct net *net = sock_net(skb->sk);
1735         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1736         int err;
1737
1738         err = ip6_local_out(net, skb->sk, skb);
1739         if (err) {
1740                 if (err > 0)
1741                         err = net_xmit_errno(err);
1742                 if (err)
1743                         IP6_INC_STATS(net, rt->rt6i_idev,
1744                                       IPSTATS_MIB_OUTDISCARDS);
1745         }
1746
1747         return err;
1748 }
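
/*
 * Illustrative note (not in the original): a positive return from
 * ip6_local_out() is a NET_XMIT_* code; net_xmit_errno() maps
 * NET_XMIT_CN (congestion, packet still sent) to 0 and anything else
 * to -ENOBUFS, so only real drops bump OUTDISCARDS above.
 */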
1749
1750 int ip6_push_pending_frames(struct sock *sk)
1751 {
1752         struct sk_buff *skb;
1753
1754         skb = ip6_finish_skb(sk);
1755         if (!skb)
1756                 return 0;
1757
1758         return ip6_send_skb(skb);
1759 }
1760 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1761
1762 static void __ip6_flush_pending_frames(struct sock *sk,
1763                                        struct sk_buff_head *queue,
1764                                        struct inet_cork_full *cork,
1765                                        struct inet6_cork *v6_cork)
1766 {
1767         struct sk_buff *skb;
1768
1769         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1770                 if (skb_dst(skb))
1771                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1772                                       IPSTATS_MIB_OUTDISCARDS);
1773                 kfree_skb(skb);
1774         }
1775
1776         ip6_cork_release(cork, v6_cork);
1777 }
1778
1779 void ip6_flush_pending_frames(struct sock *sk)
1780 {
1781         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1782                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1783 }
1784 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1785
1786 struct sk_buff *ip6_make_skb(struct sock *sk,
1787                              int getfrag(void *from, char *to, int offset,
1788                                          int len, int odd, struct sk_buff *skb),
1789                              void *from, int length, int transhdrlen,
1790                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1791                              struct rt6_info *rt, unsigned int flags,
1792                              const struct sockcm_cookie *sockc)
1793 {
1794         struct inet_cork_full cork;
1795         struct inet6_cork v6_cork;
1796         struct sk_buff_head queue;
1797         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1798         int err;
1799
1800         if (flags & MSG_PROBE)
1801                 return NULL;
1802
1803         __skb_queue_head_init(&queue);
1804
1805         cork.base.flags = 0;
1806         cork.base.addr = 0;
1807         cork.base.opt = NULL;
1808         v6_cork.opt = NULL;
1809         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1810         if (err)
1811                 return ERR_PTR(err);
1812
1813         if (ipc6->dontfrag < 0)
1814                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1815
1816         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1817                                 &current->task_frag, getfrag, from,
1818                                 length + exthdrlen, transhdrlen + exthdrlen,
1819                                 flags, ipc6, sockc);
1820         if (err) {
1821                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1822                 return ERR_PTR(err);
1823         }
1824
1825         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1826 }
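
/*
 * Illustrative usage sketch (not in the original): the cork-less fast
 * path in the spirit of udpv6_sendmsg(), which builds the datagram in
 * one call and transmits it without touching sk->sk_write_queue.
 * udp_v6_send_skb() is the (static) UDPv6 transmit helper; all other
 * variables are assumed to be prepared by the caller.
 */
#if 0
	skb = ip6_make_skb(sk, ip_generic_getfrag, msg, ulen,
			   sizeof(struct udphdr), &ipc6, &fl6,
			   (struct rt6_info *)dst, msg->msg_flags, &sockc);
	err = PTR_ERR_OR_ZERO(skb);
	if (!IS_ERR_OR_NULL(skb))
		err = udp_v6_send_skb(skb, &fl6);
#endif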