]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/ipv6/ip6_output.c
Merge tag 'ceph-for-4.9-rc1' of git://github.com/ceph/ceph-client
[karo-tx-linux.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59 #include <net/lwtunnel.h>
60
61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
62 {
63         struct dst_entry *dst = skb_dst(skb);
64         struct net_device *dev = dst->dev;
65         struct neighbour *neigh;
66         struct in6_addr *nexthop;
67         int ret;
68
69         skb->protocol = htons(ETH_P_IPV6);
70         skb->dev = dev;
71
72         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
73                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
74
75                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
76                     ((mroute6_socket(net, skb) &&
77                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
78                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
79                                          &ipv6_hdr(skb)->saddr))) {
80                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
81
82                         /* Do not check for IFF_ALLMULTI; multicast routing
83                            is not supported in any case.
84                          */
85                         if (newskb)
86                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
87                                         net, sk, newskb, NULL, newskb->dev,
88                                         dev_loopback_xmit);
89
90                         if (ipv6_hdr(skb)->hop_limit == 0) {
91                                 IP6_INC_STATS(net, idev,
92                                               IPSTATS_MIB_OUTDISCARDS);
93                                 kfree_skb(skb);
94                                 return 0;
95                         }
96                 }
97
98                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
99
100                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
101                     IPV6_ADDR_SCOPE_NODELOCAL &&
102                     !(dev->flags & IFF_LOOPBACK)) {
103                         kfree_skb(skb);
104                         return 0;
105                 }
106         }
107
108         if (lwtunnel_xmit_redirect(dst->lwtstate)) {
109                 int res = lwtunnel_xmit(skb);
110
111                 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
112                         return res;
113         }
114
115         rcu_read_lock_bh();
116         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
117         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
118         if (unlikely(!neigh))
119                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
120         if (!IS_ERR(neigh)) {
121                 ret = dst_neigh_output(dst, neigh, skb);
122                 rcu_read_unlock_bh();
123                 return ret;
124         }
125         rcu_read_unlock_bh();
126
127         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
128         kfree_skb(skb);
129         return -EINVAL;
130 }
131
132 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
133 {
134         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
135             dst_allfrag(skb_dst(skb)) ||
136             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
137                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
138         else
139                 return ip6_finish_output2(net, sk, skb);
140 }
141
142 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
143 {
144         struct net_device *dev = skb_dst(skb)->dev;
145         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
146
147         if (unlikely(idev->cnf.disable_ipv6)) {
148                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
149                 kfree_skb(skb);
150                 return 0;
151         }
152
153         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
154                             net, sk, skb, NULL, dev,
155                             ip6_finish_output,
156                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
157 }
158
159 /*
160  * xmit an sk_buff (used by TCP, SCTP and DCCP)
161  * Note : socket lock is not held for SYNACK packets, but might be modified
162  * by calls to skb_set_owner_w() and ipv6_local_error(),
163  * which are using proper atomic operations or spinlocks.
164  */
165 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
166              struct ipv6_txoptions *opt, int tclass)
167 {
168         struct net *net = sock_net(sk);
169         const struct ipv6_pinfo *np = inet6_sk(sk);
170         struct in6_addr *first_hop = &fl6->daddr;
171         struct dst_entry *dst = skb_dst(skb);
172         struct ipv6hdr *hdr;
173         u8  proto = fl6->flowi6_proto;
174         int seg_len = skb->len;
175         int hlimit = -1;
176         u32 mtu;
177
178         if (opt) {
179                 unsigned int head_room;
180
181                 /* First: exthdrs may take lots of space (~8K for now)
182                    MAX_HEADER is not enough.
183                  */
184                 head_room = opt->opt_nflen + opt->opt_flen;
185                 seg_len += head_room;
186                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
187
188                 if (skb_headroom(skb) < head_room) {
189                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
190                         if (!skb2) {
191                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
192                                               IPSTATS_MIB_OUTDISCARDS);
193                                 kfree_skb(skb);
194                                 return -ENOBUFS;
195                         }
196                         consume_skb(skb);
197                         skb = skb2;
198                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
199                          * it is safe to call in our context (socket lock not held)
200                          */
201                         skb_set_owner_w(skb, (struct sock *)sk);
202                 }
203                 if (opt->opt_flen)
204                         ipv6_push_frag_opts(skb, opt, &proto);
205                 if (opt->opt_nflen)
206                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
207         }
208
209         skb_push(skb, sizeof(struct ipv6hdr));
210         skb_reset_network_header(skb);
211         hdr = ipv6_hdr(skb);
212
213         /*
214          *      Fill in the IPv6 header
215          */
216         if (np)
217                 hlimit = np->hop_limit;
218         if (hlimit < 0)
219                 hlimit = ip6_dst_hoplimit(dst);
220
221         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
222                                                      np->autoflowlabel, fl6));
223
224         hdr->payload_len = htons(seg_len);
225         hdr->nexthdr = proto;
226         hdr->hop_limit = hlimit;
227
228         hdr->saddr = fl6->saddr;
229         hdr->daddr = *first_hop;
230
231         skb->protocol = htons(ETH_P_IPV6);
232         skb->priority = sk->sk_priority;
233         skb->mark = sk->sk_mark;
234
235         mtu = dst_mtu(dst);
236         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
237                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
238                               IPSTATS_MIB_OUT, skb->len);
239
240                 /* if egress device is enslaved to an L3 master device pass the
241                  * skb to its handler for processing
242                  */
243                 skb = l3mdev_ip6_out((struct sock *)sk, skb);
244                 if (unlikely(!skb))
245                         return 0;
246
247                 /* hooks should never assume socket lock is held.
248                  * we promote our socket to non const
249                  */
250                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
251                                net, (struct sock *)sk, skb, NULL, dst->dev,
252                                dst_output);
253         }
254
255         skb->dev = dst->dev;
256         /* ipv6_local_error() does not require socket lock,
257          * we promote our socket to non const
258          */
259         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
260
261         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
262         kfree_skb(skb);
263         return -EMSGSIZE;
264 }
265 EXPORT_SYMBOL(ip6_xmit);
266
267 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
268 {
269         struct ip6_ra_chain *ra;
270         struct sock *last = NULL;
271
272         read_lock(&ip6_ra_lock);
273         for (ra = ip6_ra_chain; ra; ra = ra->next) {
274                 struct sock *sk = ra->sk;
275                 if (sk && ra->sel == sel &&
276                     (!sk->sk_bound_dev_if ||
277                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
278                         if (last) {
279                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
280                                 if (skb2)
281                                         rawv6_rcv(last, skb2);
282                         }
283                         last = sk;
284                 }
285         }
286
287         if (last) {
288                 rawv6_rcv(last, skb);
289                 read_unlock(&ip6_ra_lock);
290                 return 1;
291         }
292         read_unlock(&ip6_ra_lock);
293         return 0;
294 }
295
296 static int ip6_forward_proxy_check(struct sk_buff *skb)
297 {
298         struct ipv6hdr *hdr = ipv6_hdr(skb);
299         u8 nexthdr = hdr->nexthdr;
300         __be16 frag_off;
301         int offset;
302
303         if (ipv6_ext_hdr(nexthdr)) {
304                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
305                 if (offset < 0)
306                         return 0;
307         } else
308                 offset = sizeof(struct ipv6hdr);
309
310         if (nexthdr == IPPROTO_ICMPV6) {
311                 struct icmp6hdr *icmp6;
312
313                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
314                                          offset + 1 - skb->data)))
315                         return 0;
316
317                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
318
319                 switch (icmp6->icmp6_type) {
320                 case NDISC_ROUTER_SOLICITATION:
321                 case NDISC_ROUTER_ADVERTISEMENT:
322                 case NDISC_NEIGHBOUR_SOLICITATION:
323                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
324                 case NDISC_REDIRECT:
325                         /* For reaction involving unicast neighbor discovery
326                          * message destined to the proxied address, pass it to
327                          * input function.
328                          */
329                         return 1;
330                 default:
331                         break;
332                 }
333         }
334
335         /*
336          * The proxying router can't forward traffic sent to a link-local
337          * address, so signal the sender and discard the packet. This
338          * behavior is clarified by the MIPv6 specification.
339          */
340         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
341                 dst_link_failure(skb);
342                 return -1;
343         }
344
345         return 0;
346 }
347
/*
 *	Continuation invoked once the FORWARD netfilter hook has accepted a
 *	packet: send it out through its dst and report dst_output()'s result.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc = dst_output(net, sk, skb);

	return rc;
}
353
354 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
355 {
356         unsigned int mtu;
357         struct inet6_dev *idev;
358
359         if (dst_metric_locked(dst, RTAX_MTU)) {
360                 mtu = dst_metric_raw(dst, RTAX_MTU);
361                 if (mtu)
362                         return mtu;
363         }
364
365         mtu = IPV6_MIN_MTU;
366         rcu_read_lock();
367         idev = __in6_dev_get(dst->dev);
368         if (idev)
369                 mtu = idev->cnf.mtu6;
370         rcu_read_unlock();
371
372         return mtu;
373 }
374
375 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
376 {
377         if (skb->len <= mtu)
378                 return false;
379
380         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
381         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
382                 return true;
383
384         if (skb->ignore_df)
385                 return false;
386
387         if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
388                 return false;
389
390         return true;
391 }
392
393 int ip6_forward(struct sk_buff *skb)
394 {
395         struct dst_entry *dst = skb_dst(skb);
396         struct ipv6hdr *hdr = ipv6_hdr(skb);
397         struct inet6_skb_parm *opt = IP6CB(skb);
398         struct net *net = dev_net(dst->dev);
399         u32 mtu;
400
401         if (net->ipv6.devconf_all->forwarding == 0)
402                 goto error;
403
404         if (skb->pkt_type != PACKET_HOST)
405                 goto drop;
406
407         if (unlikely(skb->sk))
408                 goto drop;
409
410         if (skb_warn_if_lro(skb))
411                 goto drop;
412
413         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
414                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
415                                 IPSTATS_MIB_INDISCARDS);
416                 goto drop;
417         }
418
419         skb_forward_csum(skb);
420
421         /*
422          *      We DO NOT make any processing on
423          *      RA packets, pushing them to user level AS IS
424          *      without ane WARRANTY that application will be able
425          *      to interpret them. The reason is that we
426          *      cannot make anything clever here.
427          *
428          *      We are not end-node, so that if packet contains
429          *      AH/ESP, we cannot make anything.
430          *      Defragmentation also would be mistake, RA packets
431          *      cannot be fragmented, because there is no warranty
432          *      that different fragments will go along one path. --ANK
433          */
434         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
435                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
436                         return 0;
437         }
438
439         /*
440          *      check and decrement ttl
441          */
442         if (hdr->hop_limit <= 1) {
443                 /* Force OUTPUT device used as source address */
444                 skb->dev = dst->dev;
445                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
446                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
447                                 IPSTATS_MIB_INHDRERRORS);
448
449                 kfree_skb(skb);
450                 return -ETIMEDOUT;
451         }
452
453         /* XXX: idev->cnf.proxy_ndp? */
454         if (net->ipv6.devconf_all->proxy_ndp &&
455             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
456                 int proxied = ip6_forward_proxy_check(skb);
457                 if (proxied > 0)
458                         return ip6_input(skb);
459                 else if (proxied < 0) {
460                         __IP6_INC_STATS(net, ip6_dst_idev(dst),
461                                         IPSTATS_MIB_INDISCARDS);
462                         goto drop;
463                 }
464         }
465
466         if (!xfrm6_route_forward(skb)) {
467                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
468                                 IPSTATS_MIB_INDISCARDS);
469                 goto drop;
470         }
471         dst = skb_dst(skb);
472
473         /* IPv6 specs say nothing about it, but it is clear that we cannot
474            send redirects to source routed frames.
475            We don't send redirects to frames decapsulated from IPsec.
476          */
477         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
478                 struct in6_addr *target = NULL;
479                 struct inet_peer *peer;
480                 struct rt6_info *rt;
481
482                 /*
483                  *      incoming and outgoing devices are the same
484                  *      send a redirect.
485                  */
486
487                 rt = (struct rt6_info *) dst;
488                 if (rt->rt6i_flags & RTF_GATEWAY)
489                         target = &rt->rt6i_gateway;
490                 else
491                         target = &hdr->daddr;
492
493                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
494
495                 /* Limit redirects both by destination (here)
496                    and by source (inside ndisc_send_redirect)
497                  */
498                 if (inet_peer_xrlim_allow(peer, 1*HZ))
499                         ndisc_send_redirect(skb, target);
500                 if (peer)
501                         inet_putpeer(peer);
502         } else {
503                 int addrtype = ipv6_addr_type(&hdr->saddr);
504
505                 /* This check is security critical. */
506                 if (addrtype == IPV6_ADDR_ANY ||
507                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
508                         goto error;
509                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
510                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
511                                     ICMPV6_NOT_NEIGHBOUR, 0);
512                         goto error;
513                 }
514         }
515
516         mtu = ip6_dst_mtu_forward(dst);
517         if (mtu < IPV6_MIN_MTU)
518                 mtu = IPV6_MIN_MTU;
519
520         if (ip6_pkt_too_big(skb, mtu)) {
521                 /* Again, force OUTPUT device used as source address */
522                 skb->dev = dst->dev;
523                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
524                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
525                                 IPSTATS_MIB_INTOOBIGERRORS);
526                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
527                                 IPSTATS_MIB_FRAGFAILS);
528                 kfree_skb(skb);
529                 return -EMSGSIZE;
530         }
531
532         if (skb_cow(skb, dst->dev->hard_header_len)) {
533                 __IP6_INC_STATS(net, ip6_dst_idev(dst),
534                                 IPSTATS_MIB_OUTDISCARDS);
535                 goto drop;
536         }
537
538         hdr = ipv6_hdr(skb);
539
540         /* Mangling hops number delayed to point after skb COW */
541
542         hdr->hop_limit--;
543
544         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
545         __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
546         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
547                        net, NULL, skb, skb->dev, dst->dev,
548                        ip6_forward_finish);
549
550 error:
551         __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
552 drop:
553         kfree_skb(skb);
554         return -EINVAL;
555 }
556
557 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
558 {
559         to->pkt_type = from->pkt_type;
560         to->priority = from->priority;
561         to->protocol = from->protocol;
562         skb_dst_drop(to);
563         skb_dst_set(to, dst_clone(skb_dst(from)));
564         to->dev = from->dev;
565         to->mark = from->mark;
566
567 #ifdef CONFIG_NET_SCHED
568         to->tc_index = from->tc_index;
569 #endif
570         nf_copy(to, from);
571         skb_copy_secmark(to, from);
572 }
573
574 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
575                  int (*output)(struct net *, struct sock *, struct sk_buff *))
576 {
577         struct sk_buff *frag;
578         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
579         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
580                                 inet6_sk(skb->sk) : NULL;
581         struct ipv6hdr *tmp_hdr;
582         struct frag_hdr *fh;
583         unsigned int mtu, hlen, left, len;
584         int hroom, troom;
585         __be32 frag_id;
586         int ptr, offset = 0, err = 0;
587         u8 *prevhdr, nexthdr = 0;
588
589         hlen = ip6_find_1stfragopt(skb, &prevhdr);
590         nexthdr = *prevhdr;
591
592         mtu = ip6_skb_dst_mtu(skb);
593
594         /* We must not fragment if the socket is set to force MTU discovery
595          * or if the skb it not generated by a local socket.
596          */
597         if (unlikely(!skb->ignore_df && skb->len > mtu))
598                 goto fail_toobig;
599
600         if (IP6CB(skb)->frag_max_size) {
601                 if (IP6CB(skb)->frag_max_size > mtu)
602                         goto fail_toobig;
603
604                 /* don't send fragments larger than what we received */
605                 mtu = IP6CB(skb)->frag_max_size;
606                 if (mtu < IPV6_MIN_MTU)
607                         mtu = IPV6_MIN_MTU;
608         }
609
610         if (np && np->frag_size < mtu) {
611                 if (np->frag_size)
612                         mtu = np->frag_size;
613         }
614         if (mtu < hlen + sizeof(struct frag_hdr) + 8)
615                 goto fail_toobig;
616         mtu -= hlen + sizeof(struct frag_hdr);
617
618         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
619                                     &ipv6_hdr(skb)->saddr);
620
621         if (skb->ip_summed == CHECKSUM_PARTIAL &&
622             (err = skb_checksum_help(skb)))
623                 goto fail;
624
625         hroom = LL_RESERVED_SPACE(rt->dst.dev);
626         if (skb_has_frag_list(skb)) {
627                 int first_len = skb_pagelen(skb);
628                 struct sk_buff *frag2;
629
630                 if (first_len - hlen > mtu ||
631                     ((first_len - hlen) & 7) ||
632                     skb_cloned(skb) ||
633                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
634                         goto slow_path;
635
636                 skb_walk_frags(skb, frag) {
637                         /* Correct geometry. */
638                         if (frag->len > mtu ||
639                             ((frag->len & 7) && frag->next) ||
640                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
641                                 goto slow_path_clean;
642
643                         /* Partially cloned skb? */
644                         if (skb_shared(frag))
645                                 goto slow_path_clean;
646
647                         BUG_ON(frag->sk);
648                         if (skb->sk) {
649                                 frag->sk = skb->sk;
650                                 frag->destructor = sock_wfree;
651                         }
652                         skb->truesize -= frag->truesize;
653                 }
654
655                 err = 0;
656                 offset = 0;
657                 /* BUILD HEADER */
658
659                 *prevhdr = NEXTHDR_FRAGMENT;
660                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
661                 if (!tmp_hdr) {
662                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
663                                       IPSTATS_MIB_FRAGFAILS);
664                         err = -ENOMEM;
665                         goto fail;
666                 }
667                 frag = skb_shinfo(skb)->frag_list;
668                 skb_frag_list_init(skb);
669
670                 __skb_pull(skb, hlen);
671                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
672                 __skb_push(skb, hlen);
673                 skb_reset_network_header(skb);
674                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
675
676                 fh->nexthdr = nexthdr;
677                 fh->reserved = 0;
678                 fh->frag_off = htons(IP6_MF);
679                 fh->identification = frag_id;
680
681                 first_len = skb_pagelen(skb);
682                 skb->data_len = first_len - skb_headlen(skb);
683                 skb->len = first_len;
684                 ipv6_hdr(skb)->payload_len = htons(first_len -
685                                                    sizeof(struct ipv6hdr));
686
687                 dst_hold(&rt->dst);
688
689                 for (;;) {
690                         /* Prepare header of the next frame,
691                          * before previous one went down. */
692                         if (frag) {
693                                 frag->ip_summed = CHECKSUM_NONE;
694                                 skb_reset_transport_header(frag);
695                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
696                                 __skb_push(frag, hlen);
697                                 skb_reset_network_header(frag);
698                                 memcpy(skb_network_header(frag), tmp_hdr,
699                                        hlen);
700                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
701                                 fh->nexthdr = nexthdr;
702                                 fh->reserved = 0;
703                                 fh->frag_off = htons(offset);
704                                 if (frag->next)
705                                         fh->frag_off |= htons(IP6_MF);
706                                 fh->identification = frag_id;
707                                 ipv6_hdr(frag)->payload_len =
708                                                 htons(frag->len -
709                                                       sizeof(struct ipv6hdr));
710                                 ip6_copy_metadata(frag, skb);
711                         }
712
713                         err = output(net, sk, skb);
714                         if (!err)
715                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
716                                               IPSTATS_MIB_FRAGCREATES);
717
718                         if (err || !frag)
719                                 break;
720
721                         skb = frag;
722                         frag = skb->next;
723                         skb->next = NULL;
724                 }
725
726                 kfree(tmp_hdr);
727
728                 if (err == 0) {
729                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
730                                       IPSTATS_MIB_FRAGOKS);
731                         ip6_rt_put(rt);
732                         return 0;
733                 }
734
735                 kfree_skb_list(frag);
736
737                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738                               IPSTATS_MIB_FRAGFAILS);
739                 ip6_rt_put(rt);
740                 return err;
741
742 slow_path_clean:
743                 skb_walk_frags(skb, frag2) {
744                         if (frag2 == frag)
745                                 break;
746                         frag2->sk = NULL;
747                         frag2->destructor = NULL;
748                         skb->truesize += frag2->truesize;
749                 }
750         }
751
752 slow_path:
753         left = skb->len - hlen;         /* Space per frame */
754         ptr = hlen;                     /* Where to start from */
755
756         /*
757          *      Fragment the datagram.
758          */
759
760         *prevhdr = NEXTHDR_FRAGMENT;
761         troom = rt->dst.dev->needed_tailroom;
762
763         /*
764          *      Keep copying data until we run out.
765          */
766         while (left > 0)        {
767                 len = left;
768                 /* IF: it doesn't fit, use 'mtu' - the data space left */
769                 if (len > mtu)
770                         len = mtu;
771                 /* IF: we are not sending up to and including the packet end
772                    then align the next start on an eight byte boundary */
773                 if (len < left) {
774                         len &= ~7;
775                 }
776
777                 /* Allocate buffer */
778                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
779                                  hroom + troom, GFP_ATOMIC);
780                 if (!frag) {
781                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
782                                       IPSTATS_MIB_FRAGFAILS);
783                         err = -ENOMEM;
784                         goto fail;
785                 }
786
787                 /*
788                  *      Set up data on packet
789                  */
790
791                 ip6_copy_metadata(frag, skb);
792                 skb_reserve(frag, hroom);
793                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
794                 skb_reset_network_header(frag);
795                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
796                 frag->transport_header = (frag->network_header + hlen +
797                                           sizeof(struct frag_hdr));
798
799                 /*
800                  *      Charge the memory for the fragment to any owner
801                  *      it might possess
802                  */
803                 if (skb->sk)
804                         skb_set_owner_w(frag, skb->sk);
805
806                 /*
807                  *      Copy the packet header into the new buffer.
808                  */
809                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
810
811                 /*
812                  *      Build fragment header.
813                  */
814                 fh->nexthdr = nexthdr;
815                 fh->reserved = 0;
816                 fh->identification = frag_id;
817
818                 /*
819                  *      Copy a block of the IP datagram.
820                  */
821                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
822                                      len));
823                 left -= len;
824
825                 fh->frag_off = htons(offset);
826                 if (left > 0)
827                         fh->frag_off |= htons(IP6_MF);
828                 ipv6_hdr(frag)->payload_len = htons(frag->len -
829                                                     sizeof(struct ipv6hdr));
830
831                 ptr += len;
832                 offset += len;
833
834                 /*
835                  *      Put this fragment into the sending queue.
836                  */
837                 err = output(net, sk, frag);
838                 if (err)
839                         goto fail;
840
841                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
842                               IPSTATS_MIB_FRAGCREATES);
843         }
844         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
845                       IPSTATS_MIB_FRAGOKS);
846         consume_skb(skb);
847         return err;
848
849 fail_toobig:
850         if (skb->sk && dst_allfrag(skb_dst(skb)))
851                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
852
853         skb->dev = skb_dst(skb)->dev;
854         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
855         err = -EMSGSIZE;
856
857 fail:
858         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
859                       IPSTATS_MIB_FRAGFAILS);
860         kfree_skb(skb);
861         return err;
862 }
863
864 static inline int ip6_rt_check(const struct rt6key *rt_key,
865                                const struct in6_addr *fl_addr,
866                                const struct in6_addr *addr_cache)
867 {
868         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
869                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
870 }
871
872 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
873                                           struct dst_entry *dst,
874                                           const struct flowi6 *fl6)
875 {
876         struct ipv6_pinfo *np = inet6_sk(sk);
877         struct rt6_info *rt;
878
879         if (!dst)
880                 goto out;
881
882         if (dst->ops->family != AF_INET6) {
883                 dst_release(dst);
884                 return NULL;
885         }
886
887         rt = (struct rt6_info *)dst;
888         /* Yes, checking route validity in not connected
889          * case is not very simple. Take into account,
890          * that we do not support routing by source, TOS,
891          * and MSG_DONTROUTE            --ANK (980726)
892          *
893          * 1. ip6_rt_check(): If route was host route,
894          *    check that cached destination is current.
895          *    If it is network route, we still may
896          *    check its validity using saved pointer
897          *    to the last used address: daddr_cache.
898          *    We do not want to save whole address now,
899          *    (because main consumer of this service
900          *    is tcp, which has not this problem),
901          *    so that the last trick works only on connected
902          *    sockets.
903          * 2. oif also should be the same.
904          */
905         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
906 #ifdef CONFIG_IPV6_SUBTREES
907             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
908 #endif
909            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
910               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
911                 dst_release(dst);
912                 dst = NULL;
913         }
914
915 out:
916         return dst;
917 }
918
919 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
920                                struct dst_entry **dst, struct flowi6 *fl6)
921 {
922 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
923         struct neighbour *n;
924         struct rt6_info *rt;
925 #endif
926         int err;
927         int flags = 0;
928
929         /* The correct way to handle this would be to do
930          * ip6_route_get_saddr, and then ip6_route_output; however,
931          * the route-specific preferred source forces the
932          * ip6_route_output call _before_ ip6_route_get_saddr.
933          *
934          * In source specific routing (no src=any default route),
935          * ip6_route_output will fail given src=any saddr, though, so
936          * that's why we try it again later.
937          */
938         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
939                 struct rt6_info *rt;
940                 bool had_dst = *dst != NULL;
941
942                 if (!had_dst)
943                         *dst = ip6_route_output(net, sk, fl6);
944                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
945                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
946                                           sk ? inet6_sk(sk)->srcprefs : 0,
947                                           &fl6->saddr);
948                 if (err)
949                         goto out_err_release;
950
951                 /* If we had an erroneous initial result, pretend it
952                  * never existed and let the SA-enabled version take
953                  * over.
954                  */
955                 if (!had_dst && (*dst)->error) {
956                         dst_release(*dst);
957                         *dst = NULL;
958                 }
959
960                 if (fl6->flowi6_oif)
961                         flags |= RT6_LOOKUP_F_IFACE;
962         }
963
964         if (!*dst)
965                 *dst = ip6_route_output_flags(net, sk, fl6, flags);
966
967         err = (*dst)->error;
968         if (err)
969                 goto out_err_release;
970
971 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
972         /*
973          * Here if the dst entry we've looked up
974          * has a neighbour entry that is in the INCOMPLETE
975          * state and the src address from the flow is
976          * marked as OPTIMISTIC, we release the found
977          * dst entry and replace it instead with the
978          * dst entry of the nexthop router
979          */
980         rt = (struct rt6_info *) *dst;
981         rcu_read_lock_bh();
982         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
983                                       rt6_nexthop(rt, &fl6->daddr));
984         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
985         rcu_read_unlock_bh();
986
987         if (err) {
988                 struct inet6_ifaddr *ifp;
989                 struct flowi6 fl_gw6;
990                 int redirect;
991
992                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
993                                       (*dst)->dev, 1);
994
995                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
996                 if (ifp)
997                         in6_ifa_put(ifp);
998
999                 if (redirect) {
1000                         /*
1001                          * We need to get the dst entry for the
1002                          * default router instead
1003                          */
1004                         dst_release(*dst);
1005                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1006                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1007                         *dst = ip6_route_output(net, sk, &fl_gw6);
1008                         err = (*dst)->error;
1009                         if (err)
1010                                 goto out_err_release;
1011                 }
1012         }
1013 #endif
1014
1015         return 0;
1016
1017 out_err_release:
1018         dst_release(*dst);
1019         *dst = NULL;
1020
1021         if (err == -ENETUNREACH)
1022                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1023         return err;
1024 }
1025
1026 /**
1027  *      ip6_dst_lookup - perform route lookup on flow
1028  *      @sk: socket which provides route info
1029  *      @dst: pointer to dst_entry * for result
1030  *      @fl6: flow to lookup
1031  *
1032  *      This function performs a route lookup on the given flow.
1033  *
1034  *      It returns zero on success, or a standard errno code on error.
1035  */
1036 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1037                    struct flowi6 *fl6)
1038 {
1039         *dst = NULL;
1040         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1041 }
1042 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1043
1044 /**
1045  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1046  *      @sk: socket which provides route info
1047  *      @fl6: flow to lookup
1048  *      @final_dst: final destination address for ipsec lookup
1049  *
1050  *      This function performs a route lookup on the given flow.
1051  *
1052  *      It returns a valid dst pointer on success, or a pointer encoded
1053  *      error code.
1054  */
1055 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1056                                       const struct in6_addr *final_dst)
1057 {
1058         struct dst_entry *dst = NULL;
1059         int err;
1060
1061         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1062         if (err)
1063                 return ERR_PTR(err);
1064         if (final_dst)
1065                 fl6->daddr = *final_dst;
1066
1067         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1068 }
1069 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1070
1071 /**
1072  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1073  *      @sk: socket which provides the dst cache and route info
1074  *      @fl6: flow to lookup
1075  *      @final_dst: final destination address for ipsec lookup
1076  *
1077  *      This function performs a route lookup on the given flow with the
1078  *      possibility of using the cached route in the socket if it is valid.
1079  *      It will take the socket dst lock when operating on the dst cache.
1080  *      As a result, this function can only be used in process context.
1081  *
1082  *      It returns a valid dst pointer on success, or a pointer encoded
1083  *      error code.
1084  */
1085 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1086                                          const struct in6_addr *final_dst)
1087 {
1088         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1089
1090         dst = ip6_sk_dst_check(sk, dst, fl6);
1091         if (!dst)
1092                 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1093
1094         return dst;
1095 }
1096 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1097
1098 static inline int ip6_ufo_append_data(struct sock *sk,
1099                         struct sk_buff_head *queue,
1100                         int getfrag(void *from, char *to, int offset, int len,
1101                         int odd, struct sk_buff *skb),
1102                         void *from, int length, int hh_len, int fragheaderlen,
1103                         int exthdrlen, int transhdrlen, int mtu,
1104                         unsigned int flags, const struct flowi6 *fl6)
1105
1106 {
1107         struct sk_buff *skb;
1108         int err;
1109
1110         /* There is support for UDP large send offload by network
1111          * device, so create one single skb packet containing complete
1112          * udp datagram
1113          */
1114         skb = skb_peek_tail(queue);
1115         if (!skb) {
1116                 skb = sock_alloc_send_skb(sk,
1117                         hh_len + fragheaderlen + transhdrlen + 20,
1118                         (flags & MSG_DONTWAIT), &err);
1119                 if (!skb)
1120                         return err;
1121
1122                 /* reserve space for Hardware header */
1123                 skb_reserve(skb, hh_len);
1124
1125                 /* create space for UDP/IP header */
1126                 skb_put(skb, fragheaderlen + transhdrlen);
1127
1128                 /* initialize network header pointer */
1129                 skb_set_network_header(skb, exthdrlen);
1130
1131                 /* initialize protocol header pointer */
1132                 skb->transport_header = skb->network_header + fragheaderlen;
1133
1134                 skb->protocol = htons(ETH_P_IPV6);
1135                 skb->csum = 0;
1136
1137                 __skb_queue_tail(queue, skb);
1138         } else if (skb_is_gso(skb)) {
1139                 goto append;
1140         }
1141
1142         skb->ip_summed = CHECKSUM_PARTIAL;
1143         /* Specify the length of each IPv6 datagram fragment.
1144          * It has to be a multiple of 8.
1145          */
1146         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147                                      sizeof(struct frag_hdr)) & ~7;
1148         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1149         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1150                                                          &fl6->daddr,
1151                                                          &fl6->saddr);
1152
1153 append:
1154         return skb_append_datato_frags(sk, skb, getfrag, from,
1155                                        (length - transhdrlen));
1156 }
1157
1158 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1159                                                gfp_t gfp)
1160 {
1161         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1162 }
1163
1164 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1165                                                 gfp_t gfp)
1166 {
1167         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1168 }
1169
1170 static void ip6_append_data_mtu(unsigned int *mtu,
1171                                 int *maxfraglen,
1172                                 unsigned int fragheaderlen,
1173                                 struct sk_buff *skb,
1174                                 struct rt6_info *rt,
1175                                 unsigned int orig_mtu)
1176 {
1177         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1178                 if (!skb) {
1179                         /* first fragment, reserve header_len */
1180                         *mtu = orig_mtu - rt->dst.header_len;
1181
1182                 } else {
1183                         /*
1184                          * this fragment is not first, the headers
1185                          * space is regarded as data space.
1186                          */
1187                         *mtu = orig_mtu;
1188                 }
1189                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1190                               + fragheaderlen - sizeof(struct frag_hdr);
1191         }
1192 }
1193
1194 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1195                           struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1196                           struct rt6_info *rt, struct flowi6 *fl6)
1197 {
1198         struct ipv6_pinfo *np = inet6_sk(sk);
1199         unsigned int mtu;
1200         struct ipv6_txoptions *opt = ipc6->opt;
1201
1202         /*
1203          * setup for corking
1204          */
1205         if (opt) {
1206                 if (WARN_ON(v6_cork->opt))
1207                         return -EINVAL;
1208
1209                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1210                 if (unlikely(!v6_cork->opt))
1211                         return -ENOBUFS;
1212
1213                 v6_cork->opt->tot_len = opt->tot_len;
1214                 v6_cork->opt->opt_flen = opt->opt_flen;
1215                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1216
1217                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1218                                                     sk->sk_allocation);
1219                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1220                         return -ENOBUFS;
1221
1222                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1223                                                     sk->sk_allocation);
1224                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1225                         return -ENOBUFS;
1226
1227                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1228                                                    sk->sk_allocation);
1229                 if (opt->hopopt && !v6_cork->opt->hopopt)
1230                         return -ENOBUFS;
1231
1232                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1233                                                     sk->sk_allocation);
1234                 if (opt->srcrt && !v6_cork->opt->srcrt)
1235                         return -ENOBUFS;
1236
1237                 /* need source address above miyazawa*/
1238         }
1239         dst_hold(&rt->dst);
1240         cork->base.dst = &rt->dst;
1241         cork->fl.u.ip6 = *fl6;
1242         v6_cork->hop_limit = ipc6->hlimit;
1243         v6_cork->tclass = ipc6->tclass;
1244         if (rt->dst.flags & DST_XFRM_TUNNEL)
1245                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1246                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1247         else
1248                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1249                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1250         if (np->frag_size < mtu) {
1251                 if (np->frag_size)
1252                         mtu = np->frag_size;
1253         }
1254         cork->base.fragsize = mtu;
1255         if (dst_allfrag(rt->dst.path))
1256                 cork->base.flags |= IPCORK_ALLFRAG;
1257         cork->base.length = 0;
1258
1259         return 0;
1260 }
1261
1262 static int __ip6_append_data(struct sock *sk,
1263                              struct flowi6 *fl6,
1264                              struct sk_buff_head *queue,
1265                              struct inet_cork *cork,
1266                              struct inet6_cork *v6_cork,
1267                              struct page_frag *pfrag,
1268                              int getfrag(void *from, char *to, int offset,
1269                                          int len, int odd, struct sk_buff *skb),
1270                              void *from, int length, int transhdrlen,
1271                              unsigned int flags, struct ipcm6_cookie *ipc6,
1272                              const struct sockcm_cookie *sockc)
1273 {
1274         struct sk_buff *skb, *skb_prev = NULL;
1275         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1276         int exthdrlen = 0;
1277         int dst_exthdrlen = 0;
1278         int hh_len;
1279         int copy;
1280         int err;
1281         int offset = 0;
1282         __u8 tx_flags = 0;
1283         u32 tskey = 0;
1284         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1285         struct ipv6_txoptions *opt = v6_cork->opt;
1286         int csummode = CHECKSUM_NONE;
1287         unsigned int maxnonfragsize, headersize;
1288
1289         skb = skb_peek_tail(queue);
1290         if (!skb) {
1291                 exthdrlen = opt ? opt->opt_flen : 0;
1292                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1293         }
1294
1295         mtu = cork->fragsize;
1296         orig_mtu = mtu;
1297
1298         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1299
1300         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1301                         (opt ? opt->opt_nflen : 0);
1302         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1303                      sizeof(struct frag_hdr);
1304
1305         headersize = sizeof(struct ipv6hdr) +
1306                      (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1307                      (dst_allfrag(&rt->dst) ?
1308                       sizeof(struct frag_hdr) : 0) +
1309                      rt->rt6i_nfheader_len;
1310
1311         if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1312             (sk->sk_protocol == IPPROTO_UDP ||
1313              sk->sk_protocol == IPPROTO_RAW)) {
1314                 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1315                                 sizeof(struct ipv6hdr));
1316                 goto emsgsize;
1317         }
1318
1319         if (ip6_sk_ignore_df(sk))
1320                 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1321         else
1322                 maxnonfragsize = mtu;
1323
1324         if (cork->length + length > maxnonfragsize - headersize) {
1325 emsgsize:
1326                 ipv6_local_error(sk, EMSGSIZE, fl6,
1327                                  mtu - headersize +
1328                                  sizeof(struct ipv6hdr));
1329                 return -EMSGSIZE;
1330         }
1331
1332         /* CHECKSUM_PARTIAL only with no extension headers and when
1333          * we are not going to fragment
1334          */
1335         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1336             headersize == sizeof(struct ipv6hdr) &&
1337             length < mtu - headersize &&
1338             !(flags & MSG_MORE) &&
1339             rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1340                 csummode = CHECKSUM_PARTIAL;
1341
1342         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1343                 sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1344                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1345                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1346                         tskey = sk->sk_tskey++;
1347         }
1348
1349         /*
1350          * Let's try using as much space as possible.
1351          * Use MTU if total length of the message fits into the MTU.
1352          * Otherwise, we need to reserve fragment header and
1353          * fragment alignment (= 8-15 octects, in total).
1354          *
1355          * Note that we may need to "move" the data from the tail of
1356          * of the buffer to the new fragment when we split
1357          * the message.
1358          *
1359          * FIXME: It may be fragmented into multiple chunks
1360          *        at once if non-fragmentable extension headers
1361          *        are too large.
1362          * --yoshfuji
1363          */
1364
1365         cork->length += length;
1366         if (((length > mtu) ||
1367              (skb && skb_is_gso(skb))) &&
1368             (sk->sk_protocol == IPPROTO_UDP) &&
1369             (rt->dst.dev->features & NETIF_F_UFO) &&
1370             (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1371                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1372                                           hh_len, fragheaderlen, exthdrlen,
1373                                           transhdrlen, mtu, flags, fl6);
1374                 if (err)
1375                         goto error;
1376                 return 0;
1377         }
1378
1379         if (!skb)
1380                 goto alloc_new_skb;
1381
1382         while (length > 0) {
1383                 /* Check if the remaining data fits into current packet. */
1384                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1385                 if (copy < length)
1386                         copy = maxfraglen - skb->len;
1387
1388                 if (copy <= 0) {
1389                         char *data;
1390                         unsigned int datalen;
1391                         unsigned int fraglen;
1392                         unsigned int fraggap;
1393                         unsigned int alloclen;
1394 alloc_new_skb:
1395                         /* There's no room in the current skb */
1396                         if (skb)
1397                                 fraggap = skb->len - maxfraglen;
1398                         else
1399                                 fraggap = 0;
1400                         /* update mtu and maxfraglen if necessary */
1401                         if (!skb || !skb_prev)
1402                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1403                                                     fragheaderlen, skb, rt,
1404                                                     orig_mtu);
1405
1406                         skb_prev = skb;
1407
1408                         /*
1409                          * If remaining data exceeds the mtu,
1410                          * we know we need more fragment(s).
1411                          */
1412                         datalen = length + fraggap;
1413
1414                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1415                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1416                         if ((flags & MSG_MORE) &&
1417                             !(rt->dst.dev->features&NETIF_F_SG))
1418                                 alloclen = mtu;
1419                         else
1420                                 alloclen = datalen + fragheaderlen;
1421
1422                         alloclen += dst_exthdrlen;
1423
1424                         if (datalen != length + fraggap) {
1425                                 /*
1426                                  * this is not the last fragment, the trailer
1427                                  * space is regarded as data space.
1428                                  */
1429                                 datalen += rt->dst.trailer_len;
1430                         }
1431
1432                         alloclen += rt->dst.trailer_len;
1433                         fraglen = datalen + fragheaderlen;
1434
1435                         /*
1436                          * We just reserve space for fragment header.
1437                          * Note: this may be overallocation if the message
1438                          * (without MSG_MORE) fits into the MTU.
1439                          */
1440                         alloclen += sizeof(struct frag_hdr);
1441
1442                         if (transhdrlen) {
1443                                 skb = sock_alloc_send_skb(sk,
1444                                                 alloclen + hh_len,
1445                                                 (flags & MSG_DONTWAIT), &err);
1446                         } else {
1447                                 skb = NULL;
1448                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1449                                     2 * sk->sk_sndbuf)
1450                                         skb = sock_wmalloc(sk,
1451                                                            alloclen + hh_len, 1,
1452                                                            sk->sk_allocation);
1453                                 if (unlikely(!skb))
1454                                         err = -ENOBUFS;
1455                         }
1456                         if (!skb)
1457                                 goto error;
1458                         /*
1459                          *      Fill in the control structures
1460                          */
1461                         skb->protocol = htons(ETH_P_IPV6);
1462                         skb->ip_summed = csummode;
1463                         skb->csum = 0;
1464                         /* reserve for fragmentation and ipsec header */
1465                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1466                                     dst_exthdrlen);
1467
1468                         /* Only the initial fragment is time stamped */
1469                         skb_shinfo(skb)->tx_flags = tx_flags;
1470                         tx_flags = 0;
1471                         skb_shinfo(skb)->tskey = tskey;
1472                         tskey = 0;
1473
1474                         /*
1475                          *      Find where to start putting bytes
1476                          */
1477                         data = skb_put(skb, fraglen);
1478                         skb_set_network_header(skb, exthdrlen);
1479                         data += fragheaderlen;
1480                         skb->transport_header = (skb->network_header +
1481                                                  fragheaderlen);
1482                         if (fraggap) {
1483                                 skb->csum = skb_copy_and_csum_bits(
1484                                         skb_prev, maxfraglen,
1485                                         data + transhdrlen, fraggap, 0);
1486                                 skb_prev->csum = csum_sub(skb_prev->csum,
1487                                                           skb->csum);
1488                                 data += fraggap;
1489                                 pskb_trim_unique(skb_prev, maxfraglen);
1490                         }
1491                         copy = datalen - transhdrlen - fraggap;
1492
1493                         if (copy < 0) {
1494                                 err = -EINVAL;
1495                                 kfree_skb(skb);
1496                                 goto error;
1497                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1498                                 err = -EFAULT;
1499                                 kfree_skb(skb);
1500                                 goto error;
1501                         }
1502
1503                         offset += copy;
1504                         length -= datalen - fraggap;
1505                         transhdrlen = 0;
1506                         exthdrlen = 0;
1507                         dst_exthdrlen = 0;
1508
1509                         /*
1510                          * Put the packet on the pending queue
1511                          */
1512                         __skb_queue_tail(queue, skb);
1513                         continue;
1514                 }
1515
1516                 if (copy > length)
1517                         copy = length;
1518
1519                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1520                         unsigned int off;
1521
1522                         off = skb->len;
1523                         if (getfrag(from, skb_put(skb, copy),
1524                                                 offset, copy, off, skb) < 0) {
1525                                 __skb_trim(skb, off);
1526                                 err = -EFAULT;
1527                                 goto error;
1528                         }
1529                 } else {
1530                         int i = skb_shinfo(skb)->nr_frags;
1531
1532                         err = -ENOMEM;
1533                         if (!sk_page_frag_refill(sk, pfrag))
1534                                 goto error;
1535
1536                         if (!skb_can_coalesce(skb, i, pfrag->page,
1537                                               pfrag->offset)) {
1538                                 err = -EMSGSIZE;
1539                                 if (i == MAX_SKB_FRAGS)
1540                                         goto error;
1541
1542                                 __skb_fill_page_desc(skb, i, pfrag->page,
1543                                                      pfrag->offset, 0);
1544                                 skb_shinfo(skb)->nr_frags = ++i;
1545                                 get_page(pfrag->page);
1546                         }
1547                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1548                         if (getfrag(from,
1549                                     page_address(pfrag->page) + pfrag->offset,
1550                                     offset, copy, skb->len, skb) < 0)
1551                                 goto error_efault;
1552
1553                         pfrag->offset += copy;
1554                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1555                         skb->len += copy;
1556                         skb->data_len += copy;
1557                         skb->truesize += copy;
1558                         atomic_add(copy, &sk->sk_wmem_alloc);
1559                 }
1560                 offset += copy;
1561                 length -= copy;
1562         }
1563
1564         return 0;
1565
1566 error_efault:
1567         err = -EFAULT;
1568 error:
1569         cork->length -= length;
1570         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1571         return err;
1572 }
1573
1574 int ip6_append_data(struct sock *sk,
1575                     int getfrag(void *from, char *to, int offset, int len,
1576                                 int odd, struct sk_buff *skb),
1577                     void *from, int length, int transhdrlen,
1578                     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1579                     struct rt6_info *rt, unsigned int flags,
1580                     const struct sockcm_cookie *sockc)
1581 {
1582         struct inet_sock *inet = inet_sk(sk);
1583         struct ipv6_pinfo *np = inet6_sk(sk);
1584         int exthdrlen;
1585         int err;
1586
1587         if (flags&MSG_PROBE)
1588                 return 0;
1589         if (skb_queue_empty(&sk->sk_write_queue)) {
1590                 /*
1591                  * setup for corking
1592                  */
1593                 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1594                                      ipc6, rt, fl6);
1595                 if (err)
1596                         return err;
1597
1598                 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1599                 length += exthdrlen;
1600                 transhdrlen += exthdrlen;
1601         } else {
1602                 fl6 = &inet->cork.fl.u.ip6;
1603                 transhdrlen = 0;
1604         }
1605
1606         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1607                                  &np->cork, sk_page_frag(sk), getfrag,
1608                                  from, length, transhdrlen, flags, ipc6, sockc);
1609 }
1610 EXPORT_SYMBOL_GPL(ip6_append_data);
1611
1612 static void ip6_cork_release(struct inet_cork_full *cork,
1613                              struct inet6_cork *v6_cork)
1614 {
1615         if (v6_cork->opt) {
1616                 kfree(v6_cork->opt->dst0opt);
1617                 kfree(v6_cork->opt->dst1opt);
1618                 kfree(v6_cork->opt->hopopt);
1619                 kfree(v6_cork->opt->srcrt);
1620                 kfree(v6_cork->opt);
1621                 v6_cork->opt = NULL;
1622         }
1623
1624         if (cork->base.dst) {
1625                 dst_release(cork->base.dst);
1626                 cork->base.dst = NULL;
1627                 cork->base.flags &= ~IPCORK_ALLFRAG;
1628         }
1629         memset(&cork->fl, 0, sizeof(cork->fl));
1630 }
1631
1632 struct sk_buff *__ip6_make_skb(struct sock *sk,
1633                                struct sk_buff_head *queue,
1634                                struct inet_cork_full *cork,
1635                                struct inet6_cork *v6_cork)
1636 {
1637         struct sk_buff *skb, *tmp_skb;
1638         struct sk_buff **tail_skb;
1639         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1640         struct ipv6_pinfo *np = inet6_sk(sk);
1641         struct net *net = sock_net(sk);
1642         struct ipv6hdr *hdr;
1643         struct ipv6_txoptions *opt = v6_cork->opt;
1644         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1645         struct flowi6 *fl6 = &cork->fl.u.ip6;
1646         unsigned char proto = fl6->flowi6_proto;
1647
1648         skb = __skb_dequeue(queue);
1649         if (!skb)
1650                 goto out;
1651         tail_skb = &(skb_shinfo(skb)->frag_list);
1652
1653         /* move skb->data to ip header from ext header */
1654         if (skb->data < skb_network_header(skb))
1655                 __skb_pull(skb, skb_network_offset(skb));
1656         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1657                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1658                 *tail_skb = tmp_skb;
1659                 tail_skb = &(tmp_skb->next);
1660                 skb->len += tmp_skb->len;
1661                 skb->data_len += tmp_skb->len;
1662                 skb->truesize += tmp_skb->truesize;
1663                 tmp_skb->destructor = NULL;
1664                 tmp_skb->sk = NULL;
1665         }
1666
1667         /* Allow local fragmentation. */
1668         skb->ignore_df = ip6_sk_ignore_df(sk);
1669
1670         *final_dst = fl6->daddr;
1671         __skb_pull(skb, skb_network_header_len(skb));
1672         if (opt && opt->opt_flen)
1673                 ipv6_push_frag_opts(skb, opt, &proto);
1674         if (opt && opt->opt_nflen)
1675                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1676
1677         skb_push(skb, sizeof(struct ipv6hdr));
1678         skb_reset_network_header(skb);
1679         hdr = ipv6_hdr(skb);
1680
1681         ip6_flow_hdr(hdr, v6_cork->tclass,
1682                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1683                                         np->autoflowlabel, fl6));
1684         hdr->hop_limit = v6_cork->hop_limit;
1685         hdr->nexthdr = proto;
1686         hdr->saddr = fl6->saddr;
1687         hdr->daddr = *final_dst;
1688
1689         skb->priority = sk->sk_priority;
1690         skb->mark = sk->sk_mark;
1691
1692         skb_dst_set(skb, dst_clone(&rt->dst));
1693         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1694         if (proto == IPPROTO_ICMPV6) {
1695                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1696
1697                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1698                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1699         }
1700
1701         ip6_cork_release(cork, v6_cork);
1702 out:
1703         return skb;
1704 }
1705
1706 int ip6_send_skb(struct sk_buff *skb)
1707 {
1708         struct net *net = sock_net(skb->sk);
1709         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1710         int err;
1711
1712         err = ip6_local_out(net, skb->sk, skb);
1713         if (err) {
1714                 if (err > 0)
1715                         err = net_xmit_errno(err);
1716                 if (err)
1717                         IP6_INC_STATS(net, rt->rt6i_idev,
1718                                       IPSTATS_MIB_OUTDISCARDS);
1719         }
1720
1721         return err;
1722 }
1723
/*
 * ip6_push_pending_frames - build and send whatever is pending on @sk
 *
 * Returns 0 when ip6_finish_skb() yields no packet, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
1734 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1735
1736 static void __ip6_flush_pending_frames(struct sock *sk,
1737                                        struct sk_buff_head *queue,
1738                                        struct inet_cork_full *cork,
1739                                        struct inet6_cork *v6_cork)
1740 {
1741         struct sk_buff *skb;
1742
1743         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1744                 if (skb_dst(skb))
1745                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1746                                       IPSTATS_MIB_OUTDISCARDS);
1747                 kfree_skb(skb);
1748         }
1749
1750         ip6_cork_release(cork, v6_cork);
1751 }
1752
1753 void ip6_flush_pending_frames(struct sock *sk)
1754 {
1755         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1756                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1757 }
1758 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1759
1760 struct sk_buff *ip6_make_skb(struct sock *sk,
1761                              int getfrag(void *from, char *to, int offset,
1762                                          int len, int odd, struct sk_buff *skb),
1763                              void *from, int length, int transhdrlen,
1764                              struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1765                              struct rt6_info *rt, unsigned int flags,
1766                              const struct sockcm_cookie *sockc)
1767 {
1768         struct inet_cork_full cork;
1769         struct inet6_cork v6_cork;
1770         struct sk_buff_head queue;
1771         int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1772         int err;
1773
1774         if (flags & MSG_PROBE)
1775                 return NULL;
1776
1777         __skb_queue_head_init(&queue);
1778
1779         cork.base.flags = 0;
1780         cork.base.addr = 0;
1781         cork.base.opt = NULL;
1782         v6_cork.opt = NULL;
1783         err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1784         if (err)
1785                 return ERR_PTR(err);
1786
1787         if (ipc6->dontfrag < 0)
1788                 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1789
1790         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1791                                 &current->task_frag, getfrag, from,
1792                                 length + exthdrlen, transhdrlen + exthdrlen,
1793                                 flags, ipc6, sockc);
1794         if (err) {
1795                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1796                 return ERR_PTR(err);
1797         }
1798
1799         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1800 }