]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/ipv6/ip6_output.c
rbd: require stable pages if message data CRCs are enabled
[karo-tx-linux.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
/* Final transmit step: resolve the neighbour for the route's next hop and
 * hand the packet to the neighbour output path.  Also implements multicast
 * loopback and scope filtering for multicast destinations.
 * Returns the neighbour output result, 0 when the packet was consumed
 * (looped back / scope-dropped), or -EINVAL when no neighbour entry could
 * be created.
 */
static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and either the multicast-router
		 * socket owns the packet (and it was not already forwarded)
		 * or a local process has joined the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* Hop limit 0: the packet must not be sent on the
			 * wire; only the looped-back copy (if any) survives.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		/* Interface/node-local scoped multicast must never leave
		 * the node via a real device.
		 */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* Neighbour lookup runs under RCU-bh; create an entry on demand
	 * when none is cached for this next hop.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
124
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128             dst_allfrag(skb_dst(skb)) ||
129             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130                 return ip6_fragment(sk, skb, ip6_finish_output2);
131         else
132                 return ip6_finish_output2(sk, skb);
133 }
134
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137         struct net_device *dev = skb_dst(skb)->dev;
138         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139         if (unlikely(idev->cnf.disable_ipv6)) {
140                 IP6_INC_STATS(dev_net(dev), idev,
141                               IPSTATS_MIB_OUTDISCARDS);
142                 kfree_skb(skb);
143                 return 0;
144         }
145
146         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147                             NULL, dev,
148                             ip6_finish_output,
149                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151
152 /*
153  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157              struct ipv6_txoptions *opt, int tclass)
158 {
159         struct net *net = sock_net(sk);
160         struct ipv6_pinfo *np = inet6_sk(sk);
161         struct in6_addr *first_hop = &fl6->daddr;
162         struct dst_entry *dst = skb_dst(skb);
163         struct ipv6hdr *hdr;
164         u8  proto = fl6->flowi6_proto;
165         int seg_len = skb->len;
166         int hlimit = -1;
167         u32 mtu;
168
169         if (opt) {
170                 unsigned int head_room;
171
172                 /* First: exthdrs may take lots of space (~8K for now)
173                    MAX_HEADER is not enough.
174                  */
175                 head_room = opt->opt_nflen + opt->opt_flen;
176                 seg_len += head_room;
177                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178
179                 if (skb_headroom(skb) < head_room) {
180                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181                         if (!skb2) {
182                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183                                               IPSTATS_MIB_OUTDISCARDS);
184                                 kfree_skb(skb);
185                                 return -ENOBUFS;
186                         }
187                         consume_skb(skb);
188                         skb = skb2;
189                         skb_set_owner_w(skb, sk);
190                 }
191                 if (opt->opt_flen)
192                         ipv6_push_frag_opts(skb, opt, &proto);
193                 if (opt->opt_nflen)
194                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195         }
196
197         skb_push(skb, sizeof(struct ipv6hdr));
198         skb_reset_network_header(skb);
199         hdr = ipv6_hdr(skb);
200
201         /*
202          *      Fill in the IPv6 header
203          */
204         if (np)
205                 hlimit = np->hop_limit;
206         if (hlimit < 0)
207                 hlimit = ip6_dst_hoplimit(dst);
208
209         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210                                                      np->autoflowlabel, fl6));
211
212         hdr->payload_len = htons(seg_len);
213         hdr->nexthdr = proto;
214         hdr->hop_limit = hlimit;
215
216         hdr->saddr = fl6->saddr;
217         hdr->daddr = *first_hop;
218
219         skb->protocol = htons(ETH_P_IPV6);
220         skb->priority = sk->sk_priority;
221         skb->mark = sk->sk_mark;
222
223         mtu = dst_mtu(dst);
224         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226                               IPSTATS_MIB_OUT, skb->len);
227                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228                                NULL, dst->dev, dst_output_sk);
229         }
230
231         skb->dev = dst->dev;
232         ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234         kfree_skb(skb);
235         return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
238
/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain with matching selector @sel (and, if bound, matching
 * device).  Every recipient but the last gets a clone; the last gets the
 * original skb.  Returns 1 when the skb was consumed by a socket,
 * 0 when no socket matched (caller keeps ownership of the skb).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Hand the previously found match a clone; the
			 * original is saved for the final recipient.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		/* Deliver the original while still holding the lock, so
		 * 'last' cannot be unregistered underneath us.
		 */
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
267
/* Classify a packet destined to a proxied (pneigh) address:
 * returns 1 to divert it to local input (neighbour-discovery ICMPv6
 * messages), 0 to let normal forwarding proceed, and -1 to drop it
 * (link-local destinations cannot be proxy-forwarded; link failure has
 * been signalled to the dst).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Locate the upper-layer header, skipping any extension headers. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
319
/* Final step of the FORWARD hook: clear the cached sender CPU (the skb
 * leaves on a different device) and push it into the output path.
 */
static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
{
	skb_sender_cpu_clear(skb);
	return dst_output_sk(sk, skb);
}
325
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328         unsigned int mtu;
329         struct inet6_dev *idev;
330
331         if (dst_metric_locked(dst, RTAX_MTU)) {
332                 mtu = dst_metric_raw(dst, RTAX_MTU);
333                 if (mtu)
334                         return mtu;
335         }
336
337         mtu = IPV6_MIN_MTU;
338         rcu_read_lock();
339         idev = __in6_dev_get(dst->dev);
340         if (idev)
341                 mtu = idev->cnf.mtu6;
342         rcu_read_unlock();
343
344         return mtu;
345 }
346
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349         if (skb->len <= mtu)
350                 return false;
351
352         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354                 return true;
355
356         if (skb->ignore_df)
357                 return false;
358
359         if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360                 return false;
361
362         return true;
363 }
364
/* Forward a received IPv6 packet towards its destination.
 *
 * Applies the forwarding policy checks (global forwarding sysctl, packet
 * type, LRO, XFRM policy), diverts Router Alert packets to interested
 * raw sockets and NDISC messages to proxied addresses, enforces hop
 * limit and path MTU, may emit an ICMPv6 redirect, then decrements the
 * hop limit and runs the packet through the FORWARD netfilter hook.
 * The skb is consumed on every path; returns 0 (or the hook verdict) on
 * success and a negative errno on drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* Only frames addressed to us at the link layer are forwarded. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* An skb still owned by a local socket must not be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	/* skb_warn_if_lro() flags (and warns about) LRO-merged frames,
	 * which must not be forwarded.
	 */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
					 IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have switched the attached route. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Make the header writable before mangling the hop limit. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
				 IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
		       skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
528
529 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
530 {
531         to->pkt_type = from->pkt_type;
532         to->priority = from->priority;
533         to->protocol = from->protocol;
534         skb_dst_drop(to);
535         skb_dst_set(to, dst_clone(skb_dst(from)));
536         to->dev = from->dev;
537         to->mark = from->mark;
538
539 #ifdef CONFIG_NET_SCHED
540         to->tc_index = from->tc_index;
541 #endif
542         nf_copy(to, from);
543         skb_copy_secmark(to, from);
544 }
545
/* Fragment @skb to fit the path MTU and transmit each fragment via
 * @output.
 *
 * Fast path: when the skb already carries a frag list whose elements
 * have suitable size/alignment/headroom, a fragment header is spliced
 * in front of each element in place.  Otherwise the slow path allocates
 * a fresh skb per fragment and copies the data.  The original skb is
 * consumed on all paths.  Returns 0 on success or a negative errno
 * (fragments already handed to @output are not recalled on error).
 */
int ip6_fragment(struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	/* The per-socket frag_size override only applies to packets owned
	 * by a local socket and not re-entering from within another
	 * device's transmit path (dev_recursion_level()).
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* hlen = length of the unfragmentable header part repeated in
	 * every fragment; prevhdr points at the nexthdr byte that will be
	 * overwritten with NEXTHDR_FRAGMENT.
	 */
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* From here on mtu is the per-fragment payload budget.
	 * NOTE(review): assumes mtu > hlen + sizeof(struct frag_hdr);
	 * an unsigned underflow here would wrap — confirm callers always
	 * provide a sane dst MTU.
	 */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		/* The fast path needs the head and every list element to
		 * already have the right size, 8-byte alignment, headroom,
		 * and to be unshared/uncloned.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Transfer wmem ownership of each element to the
			 * owning socket; undone in slow_path_clean.
			 */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		/* Splice a fragment header between the unfragmentable
		 * headers and the payload of the first fragment.
		 */
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				/* More-fragments on all but the last. */
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		/* Error: free the fragments not yet handed to output(). */
		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		/* Undo the partial ownership transfer done above before
		 * falling back to the slow path.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/* The copy below cannot complete a partial checksum; finish it
	 * in software first.
	 */
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	/* Oversized and not allowed to fragment: notify the sender. */
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	skb->dev = skb_dst(skb)->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
834
835 static inline int ip6_rt_check(const struct rt6key *rt_key,
836                                const struct in6_addr *fl_addr,
837                                const struct in6_addr *addr_cache)
838 {
839         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
840                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
841 }
842
843 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
844                                           struct dst_entry *dst,
845                                           const struct flowi6 *fl6)
846 {
847         struct ipv6_pinfo *np = inet6_sk(sk);
848         struct rt6_info *rt;
849
850         if (!dst)
851                 goto out;
852
853         if (dst->ops->family != AF_INET6) {
854                 dst_release(dst);
855                 return NULL;
856         }
857
858         rt = (struct rt6_info *)dst;
859         /* Yes, checking route validity in not connected
860          * case is not very simple. Take into account,
861          * that we do not support routing by source, TOS,
862          * and MSG_DONTROUTE            --ANK (980726)
863          *
864          * 1. ip6_rt_check(): If route was host route,
865          *    check that cached destination is current.
866          *    If it is network route, we still may
867          *    check its validity using saved pointer
868          *    to the last used address: daddr_cache.
869          *    We do not want to save whole address now,
870          *    (because main consumer of this service
871          *    is tcp, which has not this problem),
872          *    so that the last trick works only on connected
873          *    sockets.
874          * 2. oif also should be the same.
875          */
876         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
877 #ifdef CONFIG_IPV6_SUBTREES
878             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
879 #endif
880             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
881                 dst_release(dst);
882                 dst = NULL;
883         }
884
885 out:
886         return dst;
887 }
888
/* Core of the route lookup: fill *dst for flow @fl6, choosing a source
 * address if the flow has none.  On failure releases any dst obtained,
 * sets *dst to NULL and returns a negative errno.
 */
static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                /* Pass NULL rt on lookup error so get_saddr ignores it. */
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }
        }

        /* Retry (or first attempt) now that saddr is resolved. */
        if (!*dst)
                *dst = ip6_route_output(net, sk, fl6);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        /* -EINVAL here only marks "neighbour not yet valid". */
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        /* daddr = :: routes via the default router. */
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}
990
/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to perform the lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1008
1009 /**
1010  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1011  *      @sk: socket which provides route info
1012  *      @fl6: flow to lookup
1013  *      @final_dst: final destination address for ipsec lookup
1014  *
1015  *      This function performs a route lookup on the given flow.
1016  *
1017  *      It returns a valid dst pointer on success, or a pointer encoded
1018  *      error code.
1019  */
1020 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1021                                       const struct in6_addr *final_dst)
1022 {
1023         struct dst_entry *dst = NULL;
1024         int err;
1025
1026         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1027         if (err)
1028                 return ERR_PTR(err);
1029         if (final_dst)
1030                 fl6->daddr = *final_dst;
1031         if (!fl6->flowi6_oif)
1032                 fl6->flowi6_oif = dst->dev->ifindex;
1033
1034         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1035 }
1036 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1037
1038 /**
1039  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1040  *      @sk: socket which provides the dst cache and route info
1041  *      @fl6: flow to lookup
1042  *      @final_dst: final destination address for ipsec lookup
1043  *
1044  *      This function performs a route lookup on the given flow with the
1045  *      possibility of using the cached route in the socket if it is valid.
1046  *      It will take the socket dst lock when operating on the dst cache.
1047  *      As a result, this function can only be used in process context.
1048  *
1049  *      It returns a valid dst pointer on success, or a pointer encoded
1050  *      error code.
1051  */
1052 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1053                                          const struct in6_addr *final_dst)
1054 {
1055         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1056         int err;
1057
1058         dst = ip6_sk_dst_check(sk, dst, fl6);
1059
1060         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1061         if (err)
1062                 return ERR_PTR(err);
1063         if (final_dst)
1064                 fl6->daddr = *final_dst;
1065
1066         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1067 }
1068 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1069
/* Append data for a UDP datagram that will be segmented by the device
 * (UFO): build (or reuse) a single large skb on @queue and tack the
 * payload onto its frag list.  Returns 0 or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags,
                        const struct flowi6 *fl6)

{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by network
         * device, so create one single skb packet containing complete
         * udp datagram
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                /* +20 slack beyond the headers; inherited constant —
                 * NOTE(review): presumably headroom padding, confirm. */
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                /* Tail skb is already GSO-configured: just append. */
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        /* One fragment ID shared by all segments the device will emit. */
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}
1129
1130 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131                                                gfp_t gfp)
1132 {
1133         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134 }
1135
1136 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137                                                 gfp_t gfp)
1138 {
1139         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140 }
1141
1142 static void ip6_append_data_mtu(unsigned int *mtu,
1143                                 int *maxfraglen,
1144                                 unsigned int fragheaderlen,
1145                                 struct sk_buff *skb,
1146                                 struct rt6_info *rt,
1147                                 unsigned int orig_mtu)
1148 {
1149         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150                 if (!skb) {
1151                         /* first fragment, reserve header_len */
1152                         *mtu = orig_mtu - rt->dst.header_len;
1153
1154                 } else {
1155                         /*
1156                          * this fragment is not first, the headers
1157                          * space is regarded as data space.
1158                          */
1159                         *mtu = orig_mtu;
1160                 }
1161                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162                               + fragheaderlen - sizeof(struct frag_hdr);
1163         }
1164 }
1165
/* Initialize cork state for a new corked send: duplicate @opt into
 * v6_cork, pin the route, record the flow and per-packet parameters,
 * and compute the fragment size.  Returns 0 or a negative errno.
 *
 * NOTE(review): on -ENOBUFS after a partial option dup, the blocks
 * already duplicated stay attached to v6_cork->opt; presumably the
 * caller's cork-release path frees them — confirm against callers.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork,
                          int hlimit, int tclass, struct ipv6_txoptions *opt,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;

        /*
         * setup for corking
         */
        if (opt) {
                /* A second setup with options still corked is a bug. */
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = opt->tot_len;
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa*/
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = hlimit;
        v6_cork->tclass = tclass;
        /* PMTUDISC_PROBE uses the raw device MTU; otherwise the path
         * MTU (inner path for XFRM tunnels). */
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        /* A user-set frag_size may only shrink the fragment size. */
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}
1233
/* Append @length bytes (pulled via @getfrag) to the corked packet on
 * @queue, growing the tail skb and/or allocating new fragment-sized
 * skbs as needed.  Handles the UFO fast path, size-limit checks,
 * timestamping of the first fragment, and scatter-gather page frags.
 * Returns 0 or a negative errno (cork->length is rolled back on error).
 */
static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, int dontfrag)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;

        /* Extension-header space is only charged on the first skb. */
        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        mtu = cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        /* Headers replicated in every fragment: IPv6 + per-fragment opts. */
        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                unsigned int maxnonfragsize, headersize;

                headersize = sizeof(struct ipv6hdr) +
                             (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                             (dst_allfrag(&rt->dst) ?
                              sizeof(struct frag_hdr) : 0) +
                             rt->rt6i_nfheader_len;

                if (ip6_sk_ignore_df(sk))
                        maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
                else
                        maxnonfragsize = mtu;

                /* dontfrag active */
                if ((cork->length + length > mtu - headersize) && dontfrag &&
                    (sk->sk_protocol == IPPROTO_UDP ||
                     sk->sk_protocol == IPPROTO_RAW)) {
                        /* Report the path MTU to the app instead of
                         * fragmenting (IPV6_DONTFRAG semantics). */
                        ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                                   sizeof(struct ipv6hdr));
                        goto emsgsize;
                }

                if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                        ipv6_local_error(sk, EMSGSIZE, fl6,
                                         mtu - headersize +
                                         sizeof(struct ipv6hdr));
                        return -EMSGSIZE;
                }
        }

        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
                sock_tx_timestamp(sk, &tx_flags);
                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                        tskey = sk->sk_tskey++;
        }

        /* If this is the first and only packet and device
         * supports checksum offloading, let's use it.
         * Use transhdrlen, same as IPv4, because partial
         * sums only work when transhdrlen is set.
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            length + fragheaderlen < mtu &&
            rt->dst.dev->features & NETIF_F_V6_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;
        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octects, in total).
         *
         * Note that we may need to "move" the data from the tail of
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        cork->length += length;
        /* UFO fast path: hand the whole payload to the device. */
        if (((length > mtu) ||
             (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO) &&
            (sk->sk_type == SOCK_DGRAM)) {
                err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
                                          hh_len, fragheaderlen,
                                          transhdrlen, mtu, flags, fl6);
                if (err)
                        goto error;
                return 0;
        }

        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
alloc_new_skb:
                        /* There's no room in the current skb */
                        if (skb)
                                fraggap = skb->len - maxfraglen;
                        else
                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (!skb || !skb_prev)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt,
                                                    orig_mtu);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        alloclen += dst_exthdrlen;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        alloclen += rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                /* First skb: may sleep waiting for memory. */
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                /* Later skbs: bounded by 2 * sndbuf. */
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /* Only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = tx_flags;
                        tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                /* Move the tail overflow of the previous skb
                                 * into this one, keeping checksums in sync. */
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;

                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        /* No scatter-gather: copy into the linear area. */
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        /* Scatter-gather: copy into a page fragment and
                         * attach (or extend) a frag on the skb. */
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error_efault:
        err = -EFAULT;
error:
        /* Undo the optimistic accounting done before the loop. */
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        return err;
}
1548
1549 int ip6_append_data(struct sock *sk,
1550                     int getfrag(void *from, char *to, int offset, int len,
1551                                 int odd, struct sk_buff *skb),
1552                     void *from, int length, int transhdrlen, int hlimit,
1553                     int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1554                     struct rt6_info *rt, unsigned int flags, int dontfrag)
1555 {
1556         struct inet_sock *inet = inet_sk(sk);
1557         struct ipv6_pinfo *np = inet6_sk(sk);
1558         int exthdrlen;
1559         int err;
1560
1561         if (flags&MSG_PROBE)
1562                 return 0;
1563         if (skb_queue_empty(&sk->sk_write_queue)) {
1564                 /*
1565                  * setup for corking
1566                  */
1567                 err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1568                                      tclass, opt, rt, fl6);
1569                 if (err)
1570                         return err;
1571
1572                 exthdrlen = (opt ? opt->opt_flen : 0);
1573                 length += exthdrlen;
1574                 transhdrlen += exthdrlen;
1575         } else {
1576                 fl6 = &inet->cork.fl.u.ip6;
1577                 transhdrlen = 0;
1578         }
1579
1580         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1581                                  &np->cork, sk_page_frag(sk), getfrag,
1582                                  from, length, transhdrlen, flags, dontfrag);
1583 }
1584 EXPORT_SYMBOL_GPL(ip6_append_data);
1585
1586 static void ip6_cork_release(struct inet_cork_full *cork,
1587                              struct inet6_cork *v6_cork)
1588 {
1589         if (v6_cork->opt) {
1590                 kfree(v6_cork->opt->dst0opt);
1591                 kfree(v6_cork->opt->dst1opt);
1592                 kfree(v6_cork->opt->hopopt);
1593                 kfree(v6_cork->opt->srcrt);
1594                 kfree(v6_cork->opt);
1595                 v6_cork->opt = NULL;
1596         }
1597
1598         if (cork->base.dst) {
1599                 dst_release(cork->base.dst);
1600                 cork->base.dst = NULL;
1601                 cork->base.flags &= ~IPCORK_ALLFRAG;
1602         }
1603         memset(&cork->fl, 0, sizeof(cork->fl));
1604 }
1605
/*
 * __ip6_make_skb - collapse every skb queued on @queue into a single
 * packet and prepend the IPv6 header described by the cork state.
 *
 * The first skb on the queue becomes the head; all following skbs are
 * chained onto its frag_list. Extension headers (from @v6_cork->opt) and
 * the fixed IPv6 header are then pushed in front of the payload, and the
 * cork state is released.
 *
 * Returns the assembled skb, or NULL if the queue was empty.  Ownership
 * of the skb passes to the caller (typically handed to ip6_send_skb()).
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head's frag_list,
	 * accounting their length into the head and detaching them from
	 * the socket so only the head carries the destructor.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Keep the original destination; ipv6_push_nfrag_opts() may
	 * rewrite final_dst when a routing header is present.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel, fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	/* ICMPv6 keeps its own per-message-type output counters. */
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
1679
1680 int ip6_send_skb(struct sk_buff *skb)
1681 {
1682         struct net *net = sock_net(skb->sk);
1683         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1684         int err;
1685
1686         err = ip6_local_out(skb);
1687         if (err) {
1688                 if (err > 0)
1689                         err = net_xmit_errno(err);
1690                 if (err)
1691                         IP6_INC_STATS(net, rt->rt6i_idev,
1692                                       IPSTATS_MIB_OUTDISCARDS);
1693         }
1694
1695         return err;
1696 }
1697
/* Flush the corked write queue of @sk: assemble the pending fragments
 * into one datagram and transmit it.  Returns 0 if there was nothing to
 * send, otherwise the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1709
1710 static void __ip6_flush_pending_frames(struct sock *sk,
1711                                        struct sk_buff_head *queue,
1712                                        struct inet_cork_full *cork,
1713                                        struct inet6_cork *v6_cork)
1714 {
1715         struct sk_buff *skb;
1716
1717         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1718                 if (skb_dst(skb))
1719                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1720                                       IPSTATS_MIB_OUTDISCARDS);
1721                 kfree_skb(skb);
1722         }
1723
1724         ip6_cork_release(cork, v6_cork);
1725 }
1726
1727 void ip6_flush_pending_frames(struct sock *sk)
1728 {
1729         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1730                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1731 }
1732 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1733
1734 struct sk_buff *ip6_make_skb(struct sock *sk,
1735                              int getfrag(void *from, char *to, int offset,
1736                                          int len, int odd, struct sk_buff *skb),
1737                              void *from, int length, int transhdrlen,
1738                              int hlimit, int tclass,
1739                              struct ipv6_txoptions *opt, struct flowi6 *fl6,
1740                              struct rt6_info *rt, unsigned int flags,
1741                              int dontfrag)
1742 {
1743         struct inet_cork_full cork;
1744         struct inet6_cork v6_cork;
1745         struct sk_buff_head queue;
1746         int exthdrlen = (opt ? opt->opt_flen : 0);
1747         int err;
1748
1749         if (flags & MSG_PROBE)
1750                 return NULL;
1751
1752         __skb_queue_head_init(&queue);
1753
1754         cork.base.flags = 0;
1755         cork.base.addr = 0;
1756         cork.base.opt = NULL;
1757         v6_cork.opt = NULL;
1758         err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1759         if (err)
1760                 return ERR_PTR(err);
1761
1762         if (dontfrag < 0)
1763                 dontfrag = inet6_sk(sk)->dontfrag;
1764
1765         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1766                                 &current->task_frag, getfrag, from,
1767                                 length + exthdrlen, transhdrlen + exthdrlen,
1768                                 flags, dontfrag);
1769         if (err) {
1770                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1771                 return ERR_PTR(err);
1772         }
1773
1774         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1775 }