]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/ipv6/ip6_output.c
Merge tag 'wireless-drivers-next-for-davem-2015-10-09' of git://git.kernel.org/pub...
[karo-tx-linux.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
60 {
61         struct dst_entry *dst = skb_dst(skb);
62         struct net_device *dev = dst->dev;
63         struct neighbour *neigh;
64         struct in6_addr *nexthop;
65         int ret;
66
67         skb->protocol = htons(ETH_P_IPV6);
68         skb->dev = dev;
69
70         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74                     ((mroute6_socket(net, skb) &&
75                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77                                          &ipv6_hdr(skb)->saddr))) {
78                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80                         /* Do not check for IFF_ALLMULTI; multicast routing
81                            is not supported in any case.
82                          */
83                         if (newskb)
84                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85                                         net, sk, newskb, NULL, newskb->dev,
86                                         dev_loopback_xmit);
87
88                         if (ipv6_hdr(skb)->hop_limit == 0) {
89                                 IP6_INC_STATS(net, idev,
90                                               IPSTATS_MIB_OUTDISCARDS);
91                                 kfree_skb(skb);
92                                 return 0;
93                         }
94                 }
95
96                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99                     IPV6_ADDR_SCOPE_NODELOCAL &&
100                     !(dev->flags & IFF_LOOPBACK)) {
101                         kfree_skb(skb);
102                         return 0;
103                 }
104         }
105
106         rcu_read_lock_bh();
107         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
108         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
109         if (unlikely(!neigh))
110                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
111         if (!IS_ERR(neigh)) {
112                 ret = dst_neigh_output(dst, neigh, skb);
113                 rcu_read_unlock_bh();
114                 return ret;
115         }
116         rcu_read_unlock_bh();
117
118         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
119         kfree_skb(skb);
120         return -EINVAL;
121 }
122
123 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
124 {
125         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
126             dst_allfrag(skb_dst(skb)) ||
127             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
128                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
129         else
130                 return ip6_finish_output2(net, sk, skb);
131 }
132
133 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
134 {
135         struct net_device *dev = skb_dst(skb)->dev;
136         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
137
138         if (unlikely(idev->cnf.disable_ipv6)) {
139                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
140                 kfree_skb(skb);
141                 return 0;
142         }
143
144         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
145                             net, sk, skb, NULL, dev,
146                             ip6_finish_output,
147                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
148 }
149
150 /*
151  * xmit an sk_buff (used by TCP, SCTP and DCCP)
152  * Note : socket lock is not held for SYNACK packets, but might be modified
153  * by calls to skb_set_owner_w() and ipv6_local_error(),
154  * which are using proper atomic operations or spinlocks.
155  */
156 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157              struct ipv6_txoptions *opt, int tclass)
158 {
159         struct net *net = sock_net(sk);
160         const struct ipv6_pinfo *np = inet6_sk(sk);
161         struct in6_addr *first_hop = &fl6->daddr;
162         struct dst_entry *dst = skb_dst(skb);
163         struct ipv6hdr *hdr;
164         u8  proto = fl6->flowi6_proto;
165         int seg_len = skb->len;
166         int hlimit = -1;
167         u32 mtu;
168
169         if (opt) {
170                 unsigned int head_room;
171
172                 /* First: exthdrs may take lots of space (~8K for now)
173                    MAX_HEADER is not enough.
174                  */
175                 head_room = opt->opt_nflen + opt->opt_flen;
176                 seg_len += head_room;
177                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178
179                 if (skb_headroom(skb) < head_room) {
180                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181                         if (!skb2) {
182                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183                                               IPSTATS_MIB_OUTDISCARDS);
184                                 kfree_skb(skb);
185                                 return -ENOBUFS;
186                         }
187                         consume_skb(skb);
188                         skb = skb2;
189                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
190                          * it is safe to call in our context (socket lock not held)
191                          */
192                         skb_set_owner_w(skb, (struct sock *)sk);
193                 }
194                 if (opt->opt_flen)
195                         ipv6_push_frag_opts(skb, opt, &proto);
196                 if (opt->opt_nflen)
197                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198         }
199
200         skb_push(skb, sizeof(struct ipv6hdr));
201         skb_reset_network_header(skb);
202         hdr = ipv6_hdr(skb);
203
204         /*
205          *      Fill in the IPv6 header
206          */
207         if (np)
208                 hlimit = np->hop_limit;
209         if (hlimit < 0)
210                 hlimit = ip6_dst_hoplimit(dst);
211
212         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
213                                                      np->autoflowlabel, fl6));
214
215         hdr->payload_len = htons(seg_len);
216         hdr->nexthdr = proto;
217         hdr->hop_limit = hlimit;
218
219         hdr->saddr = fl6->saddr;
220         hdr->daddr = *first_hop;
221
222         skb->protocol = htons(ETH_P_IPV6);
223         skb->priority = sk->sk_priority;
224         skb->mark = sk->sk_mark;
225
226         mtu = dst_mtu(dst);
227         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
228                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
229                               IPSTATS_MIB_OUT, skb->len);
230                 /* hooks should never assume socket lock is held.
231                  * we promote our socket to non const
232                  */
233                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
234                                net, (struct sock *)sk, skb, NULL, dst->dev,
235                                dst_output);
236         }
237
238         skb->dev = dst->dev;
239         /* ipv6_local_error() does not require socket lock,
240          * we promote our socket to non const
241          */
242         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
243
244         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
245         kfree_skb(skb);
246         return -EMSGSIZE;
247 }
248 EXPORT_SYMBOL(ip6_xmit);
249
250 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
251 {
252         struct ip6_ra_chain *ra;
253         struct sock *last = NULL;
254
255         read_lock(&ip6_ra_lock);
256         for (ra = ip6_ra_chain; ra; ra = ra->next) {
257                 struct sock *sk = ra->sk;
258                 if (sk && ra->sel == sel &&
259                     (!sk->sk_bound_dev_if ||
260                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
261                         if (last) {
262                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
263                                 if (skb2)
264                                         rawv6_rcv(last, skb2);
265                         }
266                         last = sk;
267                 }
268         }
269
270         if (last) {
271                 rawv6_rcv(last, skb);
272                 read_unlock(&ip6_ra_lock);
273                 return 1;
274         }
275         read_unlock(&ip6_ra_lock);
276         return 0;
277 }
278
279 static int ip6_forward_proxy_check(struct sk_buff *skb)
280 {
281         struct ipv6hdr *hdr = ipv6_hdr(skb);
282         u8 nexthdr = hdr->nexthdr;
283         __be16 frag_off;
284         int offset;
285
286         if (ipv6_ext_hdr(nexthdr)) {
287                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
288                 if (offset < 0)
289                         return 0;
290         } else
291                 offset = sizeof(struct ipv6hdr);
292
293         if (nexthdr == IPPROTO_ICMPV6) {
294                 struct icmp6hdr *icmp6;
295
296                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
297                                          offset + 1 - skb->data)))
298                         return 0;
299
300                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
301
302                 switch (icmp6->icmp6_type) {
303                 case NDISC_ROUTER_SOLICITATION:
304                 case NDISC_ROUTER_ADVERTISEMENT:
305                 case NDISC_NEIGHBOUR_SOLICITATION:
306                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
307                 case NDISC_REDIRECT:
308                         /* For reaction involving unicast neighbor discovery
309                          * message destined to the proxied address, pass it to
310                          * input function.
311                          */
312                         return 1;
313                 default:
314                         break;
315                 }
316         }
317
318         /*
319          * The proxying router can't forward traffic sent to a link-local
320          * address, so signal the sender and discard the packet. This
321          * behavior is clarified by the MIPv6 specification.
322          */
323         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
324                 dst_link_failure(skb);
325                 return -1;
326         }
327
328         return 0;
329 }
330
/*
 * Continuation of ip6_forward() once the FORWARD netfilter hook has
 * accepted the packet: clear the recorded sender CPU and push the skb
 * down the route's output path.  Returns the dst_output() result.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc;

	skb_sender_cpu_clear(skb);
	rc = dst_output(net, sk, skb);

	return rc;
}
337
338 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
339 {
340         unsigned int mtu;
341         struct inet6_dev *idev;
342
343         if (dst_metric_locked(dst, RTAX_MTU)) {
344                 mtu = dst_metric_raw(dst, RTAX_MTU);
345                 if (mtu)
346                         return mtu;
347         }
348
349         mtu = IPV6_MIN_MTU;
350         rcu_read_lock();
351         idev = __in6_dev_get(dst->dev);
352         if (idev)
353                 mtu = idev->cnf.mtu6;
354         rcu_read_unlock();
355
356         return mtu;
357 }
358
359 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
360 {
361         if (skb->len <= mtu)
362                 return false;
363
364         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
365         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
366                 return true;
367
368         if (skb->ignore_df)
369                 return false;
370
371         if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
372                 return false;
373
374         return true;
375 }
376
377 int ip6_forward(struct sk_buff *skb)
378 {
379         struct dst_entry *dst = skb_dst(skb);
380         struct ipv6hdr *hdr = ipv6_hdr(skb);
381         struct inet6_skb_parm *opt = IP6CB(skb);
382         struct net *net = dev_net(dst->dev);
383         u32 mtu;
384
385         if (net->ipv6.devconf_all->forwarding == 0)
386                 goto error;
387
388         if (skb->pkt_type != PACKET_HOST)
389                 goto drop;
390
391         if (skb_warn_if_lro(skb))
392                 goto drop;
393
394         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
395                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
396                                  IPSTATS_MIB_INDISCARDS);
397                 goto drop;
398         }
399
400         skb_forward_csum(skb);
401
402         /*
403          *      We DO NOT make any processing on
404          *      RA packets, pushing them to user level AS IS
405          *      without ane WARRANTY that application will be able
406          *      to interpret them. The reason is that we
407          *      cannot make anything clever here.
408          *
409          *      We are not end-node, so that if packet contains
410          *      AH/ESP, we cannot make anything.
411          *      Defragmentation also would be mistake, RA packets
412          *      cannot be fragmented, because there is no warranty
413          *      that different fragments will go along one path. --ANK
414          */
415         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
416                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
417                         return 0;
418         }
419
420         /*
421          *      check and decrement ttl
422          */
423         if (hdr->hop_limit <= 1) {
424                 /* Force OUTPUT device used as source address */
425                 skb->dev = dst->dev;
426                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
427                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
428                                  IPSTATS_MIB_INHDRERRORS);
429
430                 kfree_skb(skb);
431                 return -ETIMEDOUT;
432         }
433
434         /* XXX: idev->cnf.proxy_ndp? */
435         if (net->ipv6.devconf_all->proxy_ndp &&
436             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
437                 int proxied = ip6_forward_proxy_check(skb);
438                 if (proxied > 0)
439                         return ip6_input(skb);
440                 else if (proxied < 0) {
441                         IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
442                                          IPSTATS_MIB_INDISCARDS);
443                         goto drop;
444                 }
445         }
446
447         if (!xfrm6_route_forward(skb)) {
448                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
449                                  IPSTATS_MIB_INDISCARDS);
450                 goto drop;
451         }
452         dst = skb_dst(skb);
453
454         /* IPv6 specs say nothing about it, but it is clear that we cannot
455            send redirects to source routed frames.
456            We don't send redirects to frames decapsulated from IPsec.
457          */
458         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
459                 struct in6_addr *target = NULL;
460                 struct inet_peer *peer;
461                 struct rt6_info *rt;
462
463                 /*
464                  *      incoming and outgoing devices are the same
465                  *      send a redirect.
466                  */
467
468                 rt = (struct rt6_info *) dst;
469                 if (rt->rt6i_flags & RTF_GATEWAY)
470                         target = &rt->rt6i_gateway;
471                 else
472                         target = &hdr->daddr;
473
474                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
475
476                 /* Limit redirects both by destination (here)
477                    and by source (inside ndisc_send_redirect)
478                  */
479                 if (inet_peer_xrlim_allow(peer, 1*HZ))
480                         ndisc_send_redirect(skb, target);
481                 if (peer)
482                         inet_putpeer(peer);
483         } else {
484                 int addrtype = ipv6_addr_type(&hdr->saddr);
485
486                 /* This check is security critical. */
487                 if (addrtype == IPV6_ADDR_ANY ||
488                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
489                         goto error;
490                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
491                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
492                                     ICMPV6_NOT_NEIGHBOUR, 0);
493                         goto error;
494                 }
495         }
496
497         mtu = ip6_dst_mtu_forward(dst);
498         if (mtu < IPV6_MIN_MTU)
499                 mtu = IPV6_MIN_MTU;
500
501         if (ip6_pkt_too_big(skb, mtu)) {
502                 /* Again, force OUTPUT device used as source address */
503                 skb->dev = dst->dev;
504                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
505                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
506                                  IPSTATS_MIB_INTOOBIGERRORS);
507                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
508                                  IPSTATS_MIB_FRAGFAILS);
509                 kfree_skb(skb);
510                 return -EMSGSIZE;
511         }
512
513         if (skb_cow(skb, dst->dev->hard_header_len)) {
514                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
515                                  IPSTATS_MIB_OUTDISCARDS);
516                 goto drop;
517         }
518
519         hdr = ipv6_hdr(skb);
520
521         /* Mangling hops number delayed to point after skb COW */
522
523         hdr->hop_limit--;
524
525         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
526         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
527         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
528                        net, NULL, skb, skb->dev, dst->dev,
529                        ip6_forward_finish);
530
531 error:
532         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
533 drop:
534         kfree_skb(skb);
535         return -EINVAL;
536 }
537
538 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
539 {
540         to->pkt_type = from->pkt_type;
541         to->priority = from->priority;
542         to->protocol = from->protocol;
543         skb_dst_drop(to);
544         skb_dst_set(to, dst_clone(skb_dst(from)));
545         to->dev = from->dev;
546         to->mark = from->mark;
547
548 #ifdef CONFIG_NET_SCHED
549         to->tc_index = from->tc_index;
550 #endif
551         nf_copy(to, from);
552         skb_copy_secmark(to, from);
553 }
554
555 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
556                  int (*output)(struct net *, struct sock *, struct sk_buff *))
557 {
558         struct sk_buff *frag;
559         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
560         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
561                                 inet6_sk(skb->sk) : NULL;
562         struct ipv6hdr *tmp_hdr;
563         struct frag_hdr *fh;
564         unsigned int mtu, hlen, left, len;
565         int hroom, troom;
566         __be32 frag_id;
567         int ptr, offset = 0, err = 0;
568         u8 *prevhdr, nexthdr = 0;
569
570         hlen = ip6_find_1stfragopt(skb, &prevhdr);
571         nexthdr = *prevhdr;
572
573         mtu = ip6_skb_dst_mtu(skb);
574
575         /* We must not fragment if the socket is set to force MTU discovery
576          * or if the skb it not generated by a local socket.
577          */
578         if (unlikely(!skb->ignore_df && skb->len > mtu))
579                 goto fail_toobig;
580
581         if (IP6CB(skb)->frag_max_size) {
582                 if (IP6CB(skb)->frag_max_size > mtu)
583                         goto fail_toobig;
584
585                 /* don't send fragments larger than what we received */
586                 mtu = IP6CB(skb)->frag_max_size;
587                 if (mtu < IPV6_MIN_MTU)
588                         mtu = IPV6_MIN_MTU;
589         }
590
591         if (np && np->frag_size < mtu) {
592                 if (np->frag_size)
593                         mtu = np->frag_size;
594         }
595         mtu -= hlen + sizeof(struct frag_hdr);
596
597         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
598                                     &ipv6_hdr(skb)->saddr);
599
600         hroom = LL_RESERVED_SPACE(rt->dst.dev);
601         if (skb_has_frag_list(skb)) {
602                 int first_len = skb_pagelen(skb);
603                 struct sk_buff *frag2;
604
605                 if (first_len - hlen > mtu ||
606                     ((first_len - hlen) & 7) ||
607                     skb_cloned(skb) ||
608                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
609                         goto slow_path;
610
611                 skb_walk_frags(skb, frag) {
612                         /* Correct geometry. */
613                         if (frag->len > mtu ||
614                             ((frag->len & 7) && frag->next) ||
615                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
616                                 goto slow_path_clean;
617
618                         /* Partially cloned skb? */
619                         if (skb_shared(frag))
620                                 goto slow_path_clean;
621
622                         BUG_ON(frag->sk);
623                         if (skb->sk) {
624                                 frag->sk = skb->sk;
625                                 frag->destructor = sock_wfree;
626                         }
627                         skb->truesize -= frag->truesize;
628                 }
629
630                 err = 0;
631                 offset = 0;
632                 /* BUILD HEADER */
633
634                 *prevhdr = NEXTHDR_FRAGMENT;
635                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
636                 if (!tmp_hdr) {
637                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
638                                       IPSTATS_MIB_FRAGFAILS);
639                         err = -ENOMEM;
640                         goto fail;
641                 }
642                 frag = skb_shinfo(skb)->frag_list;
643                 skb_frag_list_init(skb);
644
645                 __skb_pull(skb, hlen);
646                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
647                 __skb_push(skb, hlen);
648                 skb_reset_network_header(skb);
649                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
650
651                 fh->nexthdr = nexthdr;
652                 fh->reserved = 0;
653                 fh->frag_off = htons(IP6_MF);
654                 fh->identification = frag_id;
655
656                 first_len = skb_pagelen(skb);
657                 skb->data_len = first_len - skb_headlen(skb);
658                 skb->len = first_len;
659                 ipv6_hdr(skb)->payload_len = htons(first_len -
660                                                    sizeof(struct ipv6hdr));
661
662                 dst_hold(&rt->dst);
663
664                 for (;;) {
665                         /* Prepare header of the next frame,
666                          * before previous one went down. */
667                         if (frag) {
668                                 frag->ip_summed = CHECKSUM_NONE;
669                                 skb_reset_transport_header(frag);
670                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
671                                 __skb_push(frag, hlen);
672                                 skb_reset_network_header(frag);
673                                 memcpy(skb_network_header(frag), tmp_hdr,
674                                        hlen);
675                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
676                                 fh->nexthdr = nexthdr;
677                                 fh->reserved = 0;
678                                 fh->frag_off = htons(offset);
679                                 if (frag->next)
680                                         fh->frag_off |= htons(IP6_MF);
681                                 fh->identification = frag_id;
682                                 ipv6_hdr(frag)->payload_len =
683                                                 htons(frag->len -
684                                                       sizeof(struct ipv6hdr));
685                                 ip6_copy_metadata(frag, skb);
686                         }
687
688                         err = output(net, sk, skb);
689                         if (!err)
690                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
691                                               IPSTATS_MIB_FRAGCREATES);
692
693                         if (err || !frag)
694                                 break;
695
696                         skb = frag;
697                         frag = skb->next;
698                         skb->next = NULL;
699                 }
700
701                 kfree(tmp_hdr);
702
703                 if (err == 0) {
704                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
705                                       IPSTATS_MIB_FRAGOKS);
706                         ip6_rt_put(rt);
707                         return 0;
708                 }
709
710                 kfree_skb_list(frag);
711
712                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
713                               IPSTATS_MIB_FRAGFAILS);
714                 ip6_rt_put(rt);
715                 return err;
716
717 slow_path_clean:
718                 skb_walk_frags(skb, frag2) {
719                         if (frag2 == frag)
720                                 break;
721                         frag2->sk = NULL;
722                         frag2->destructor = NULL;
723                         skb->truesize += frag2->truesize;
724                 }
725         }
726
727 slow_path:
728         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
729             skb_checksum_help(skb))
730                 goto fail;
731
732         left = skb->len - hlen;         /* Space per frame */
733         ptr = hlen;                     /* Where to start from */
734
735         /*
736          *      Fragment the datagram.
737          */
738
739         *prevhdr = NEXTHDR_FRAGMENT;
740         troom = rt->dst.dev->needed_tailroom;
741
742         /*
743          *      Keep copying data until we run out.
744          */
745         while (left > 0)        {
746                 len = left;
747                 /* IF: it doesn't fit, use 'mtu' - the data space left */
748                 if (len > mtu)
749                         len = mtu;
750                 /* IF: we are not sending up to and including the packet end
751                    then align the next start on an eight byte boundary */
752                 if (len < left) {
753                         len &= ~7;
754                 }
755
756                 /* Allocate buffer */
757                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
758                                  hroom + troom, GFP_ATOMIC);
759                 if (!frag) {
760                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
761                                       IPSTATS_MIB_FRAGFAILS);
762                         err = -ENOMEM;
763                         goto fail;
764                 }
765
766                 /*
767                  *      Set up data on packet
768                  */
769
770                 ip6_copy_metadata(frag, skb);
771                 skb_reserve(frag, hroom);
772                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
773                 skb_reset_network_header(frag);
774                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
775                 frag->transport_header = (frag->network_header + hlen +
776                                           sizeof(struct frag_hdr));
777
778                 /*
779                  *      Charge the memory for the fragment to any owner
780                  *      it might possess
781                  */
782                 if (skb->sk)
783                         skb_set_owner_w(frag, skb->sk);
784
785                 /*
786                  *      Copy the packet header into the new buffer.
787                  */
788                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
789
790                 /*
791                  *      Build fragment header.
792                  */
793                 fh->nexthdr = nexthdr;
794                 fh->reserved = 0;
795                 fh->identification = frag_id;
796
797                 /*
798                  *      Copy a block of the IP datagram.
799                  */
800                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
801                                      len));
802                 left -= len;
803
804                 fh->frag_off = htons(offset);
805                 if (left > 0)
806                         fh->frag_off |= htons(IP6_MF);
807                 ipv6_hdr(frag)->payload_len = htons(frag->len -
808                                                     sizeof(struct ipv6hdr));
809
810                 ptr += len;
811                 offset += len;
812
813                 /*
814                  *      Put this fragment into the sending queue.
815                  */
816                 err = output(net, sk, frag);
817                 if (err)
818                         goto fail;
819
820                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821                               IPSTATS_MIB_FRAGCREATES);
822         }
823         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
824                       IPSTATS_MIB_FRAGOKS);
825         consume_skb(skb);
826         return err;
827
828 fail_toobig:
829         if (skb->sk && dst_allfrag(skb_dst(skb)))
830                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
831
832         skb->dev = skb_dst(skb)->dev;
833         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
834         err = -EMSGSIZE;
835
836 fail:
837         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
838                       IPSTATS_MIB_FRAGFAILS);
839         kfree_skb(skb);
840         return err;
841 }
842
843 static inline int ip6_rt_check(const struct rt6key *rt_key,
844                                const struct in6_addr *fl_addr,
845                                const struct in6_addr *addr_cache)
846 {
847         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
848                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
849 }
850
851 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
852                                           struct dst_entry *dst,
853                                           const struct flowi6 *fl6)
854 {
855         struct ipv6_pinfo *np = inet6_sk(sk);
856         struct rt6_info *rt;
857
858         if (!dst)
859                 goto out;
860
861         if (dst->ops->family != AF_INET6) {
862                 dst_release(dst);
863                 return NULL;
864         }
865
866         rt = (struct rt6_info *)dst;
867         /* Yes, checking route validity in not connected
868          * case is not very simple. Take into account,
869          * that we do not support routing by source, TOS,
870          * and MSG_DONTROUTE            --ANK (980726)
871          *
872          * 1. ip6_rt_check(): If route was host route,
873          *    check that cached destination is current.
874          *    If it is network route, we still may
875          *    check its validity using saved pointer
876          *    to the last used address: daddr_cache.
877          *    We do not want to save whole address now,
878          *    (because main consumer of this service
879          *    is tcp, which has not this problem),
880          *    so that the last trick works only on connected
881          *    sockets.
882          * 2. oif also should be the same.
883          */
884         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
885 #ifdef CONFIG_IPV6_SUBTREES
886             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
887 #endif
888             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
889                 dst_release(dst);
890                 dst = NULL;
891         }
892
893 out:
894         return dst;
895 }
896
897 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
898                                struct dst_entry **dst, struct flowi6 *fl6)
899 {
900 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
901         struct neighbour *n;
902         struct rt6_info *rt;
903 #endif
904         int err;
905
906         /* The correct way to handle this would be to do
907          * ip6_route_get_saddr, and then ip6_route_output; however,
908          * the route-specific preferred source forces the
909          * ip6_route_output call _before_ ip6_route_get_saddr.
910          *
911          * In source specific routing (no src=any default route),
912          * ip6_route_output will fail given src=any saddr, though, so
913          * that's why we try it again later.
914          */
915         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
916                 struct rt6_info *rt;
917                 bool had_dst = *dst != NULL;
918
919                 if (!had_dst)
920                         *dst = ip6_route_output(net, sk, fl6);
921                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
922                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
923                                           sk ? inet6_sk(sk)->srcprefs : 0,
924                                           &fl6->saddr);
925                 if (err)
926                         goto out_err_release;
927
928                 /* If we had an erroneous initial result, pretend it
929                  * never existed and let the SA-enabled version take
930                  * over.
931                  */
932                 if (!had_dst && (*dst)->error) {
933                         dst_release(*dst);
934                         *dst = NULL;
935                 }
936         }
937
938         if (!*dst)
939                 *dst = ip6_route_output(net, sk, fl6);
940
941         err = (*dst)->error;
942         if (err)
943                 goto out_err_release;
944
945 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
946         /*
947          * Here if the dst entry we've looked up
948          * has a neighbour entry that is in the INCOMPLETE
949          * state and the src address from the flow is
950          * marked as OPTIMISTIC, we release the found
951          * dst entry and replace it instead with the
952          * dst entry of the nexthop router
953          */
954         rt = (struct rt6_info *) *dst;
955         rcu_read_lock_bh();
956         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
957                                       rt6_nexthop(rt, &fl6->daddr));
958         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
959         rcu_read_unlock_bh();
960
961         if (err) {
962                 struct inet6_ifaddr *ifp;
963                 struct flowi6 fl_gw6;
964                 int redirect;
965
966                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
967                                       (*dst)->dev, 1);
968
969                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
970                 if (ifp)
971                         in6_ifa_put(ifp);
972
973                 if (redirect) {
974                         /*
975                          * We need to get the dst entry for the
976                          * default router instead
977                          */
978                         dst_release(*dst);
979                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
980                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
981                         *dst = ip6_route_output(net, sk, &fl_gw6);
982                         err = (*dst)->error;
983                         if (err)
984                                 goto out_err_release;
985                 }
986         }
987 #endif
988
989         return 0;
990
991 out_err_release:
992         if (err == -ENETUNREACH)
993                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
994         dst_release(*dst);
995         *dst = NULL;
996         return err;
997 }
998
999 /**
1000  *      ip6_dst_lookup - perform route lookup on flow
1001  *      @sk: socket which provides route info
1002  *      @dst: pointer to dst_entry * for result
1003  *      @fl6: flow to lookup
1004  *
1005  *      This function performs a route lookup on the given flow.
1006  *
1007  *      It returns zero on success, or a standard errno code on error.
1008  */
1009 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1010                    struct flowi6 *fl6)
1011 {
1012         *dst = NULL;
1013         return ip6_dst_lookup_tail(net, sk, dst, fl6);
1014 }
1015 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1016
1017 /**
1018  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1019  *      @sk: socket which provides route info
1020  *      @fl6: flow to lookup
1021  *      @final_dst: final destination address for ipsec lookup
1022  *
1023  *      This function performs a route lookup on the given flow.
1024  *
1025  *      It returns a valid dst pointer on success, or a pointer encoded
1026  *      error code.
1027  */
1028 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1029                                       const struct in6_addr *final_dst)
1030 {
1031         struct dst_entry *dst = NULL;
1032         int err;
1033
1034         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1035         if (err)
1036                 return ERR_PTR(err);
1037         if (final_dst)
1038                 fl6->daddr = *final_dst;
1039         if (!fl6->flowi6_oif)
1040                 fl6->flowi6_oif = dst->dev->ifindex;
1041
1042         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1043 }
1044 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1045
1046 /**
1047  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1048  *      @sk: socket which provides the dst cache and route info
1049  *      @fl6: flow to lookup
1050  *      @final_dst: final destination address for ipsec lookup
1051  *
1052  *      This function performs a route lookup on the given flow with the
1053  *      possibility of using the cached route in the socket if it is valid.
1054  *      It will take the socket dst lock when operating on the dst cache.
1055  *      As a result, this function can only be used in process context.
1056  *
1057  *      It returns a valid dst pointer on success, or a pointer encoded
1058  *      error code.
1059  */
1060 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1061                                          const struct in6_addr *final_dst)
1062 {
1063         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1064         int err;
1065
1066         dst = ip6_sk_dst_check(sk, dst, fl6);
1067
1068         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1069         if (err)
1070                 return ERR_PTR(err);
1071         if (final_dst)
1072                 fl6->daddr = *final_dst;
1073
1074         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1075 }
1076 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1077
1078 static inline int ip6_ufo_append_data(struct sock *sk,
1079                         struct sk_buff_head *queue,
1080                         int getfrag(void *from, char *to, int offset, int len,
1081                         int odd, struct sk_buff *skb),
1082                         void *from, int length, int hh_len, int fragheaderlen,
1083                         int transhdrlen, int mtu, unsigned int flags,
1084                         const struct flowi6 *fl6)
1085
1086 {
1087         struct sk_buff *skb;
1088         int err;
1089
1090         /* There is support for UDP large send offload by network
1091          * device, so create one single skb packet containing complete
1092          * udp datagram
1093          */
1094         skb = skb_peek_tail(queue);
1095         if (!skb) {
1096                 skb = sock_alloc_send_skb(sk,
1097                         hh_len + fragheaderlen + transhdrlen + 20,
1098                         (flags & MSG_DONTWAIT), &err);
1099                 if (!skb)
1100                         return err;
1101
1102                 /* reserve space for Hardware header */
1103                 skb_reserve(skb, hh_len);
1104
1105                 /* create space for UDP/IP header */
1106                 skb_put(skb, fragheaderlen + transhdrlen);
1107
1108                 /* initialize network header pointer */
1109                 skb_reset_network_header(skb);
1110
1111                 /* initialize protocol header pointer */
1112                 skb->transport_header = skb->network_header + fragheaderlen;
1113
1114                 skb->protocol = htons(ETH_P_IPV6);
1115                 skb->csum = 0;
1116
1117                 __skb_queue_tail(queue, skb);
1118         } else if (skb_is_gso(skb)) {
1119                 goto append;
1120         }
1121
1122         skb->ip_summed = CHECKSUM_PARTIAL;
1123         /* Specify the length of each IPv6 datagram fragment.
1124          * It has to be a multiple of 8.
1125          */
1126         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1127                                      sizeof(struct frag_hdr)) & ~7;
1128         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1129         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1130                                                          &fl6->daddr,
1131                                                          &fl6->saddr);
1132
1133 append:
1134         return skb_append_datato_frags(sk, skb, getfrag, from,
1135                                        (length - transhdrlen));
1136 }
1137
1138 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1139                                                gfp_t gfp)
1140 {
1141         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1142 }
1143
1144 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1145                                                 gfp_t gfp)
1146 {
1147         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1148 }
1149
1150 static void ip6_append_data_mtu(unsigned int *mtu,
1151                                 int *maxfraglen,
1152                                 unsigned int fragheaderlen,
1153                                 struct sk_buff *skb,
1154                                 struct rt6_info *rt,
1155                                 unsigned int orig_mtu)
1156 {
1157         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1158                 if (!skb) {
1159                         /* first fragment, reserve header_len */
1160                         *mtu = orig_mtu - rt->dst.header_len;
1161
1162                 } else {
1163                         /*
1164                          * this fragment is not first, the headers
1165                          * space is regarded as data space.
1166                          */
1167                         *mtu = orig_mtu;
1168                 }
1169                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1170                               + fragheaderlen - sizeof(struct frag_hdr);
1171         }
1172 }
1173
1174 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1175                           struct inet6_cork *v6_cork,
1176                           int hlimit, int tclass, struct ipv6_txoptions *opt,
1177                           struct rt6_info *rt, struct flowi6 *fl6)
1178 {
1179         struct ipv6_pinfo *np = inet6_sk(sk);
1180         unsigned int mtu;
1181
1182         /*
1183          * setup for corking
1184          */
1185         if (opt) {
1186                 if (WARN_ON(v6_cork->opt))
1187                         return -EINVAL;
1188
1189                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1190                 if (unlikely(!v6_cork->opt))
1191                         return -ENOBUFS;
1192
1193                 v6_cork->opt->tot_len = opt->tot_len;
1194                 v6_cork->opt->opt_flen = opt->opt_flen;
1195                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1196
1197                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1198                                                     sk->sk_allocation);
1199                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1200                         return -ENOBUFS;
1201
1202                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1203                                                     sk->sk_allocation);
1204                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1205                         return -ENOBUFS;
1206
1207                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1208                                                    sk->sk_allocation);
1209                 if (opt->hopopt && !v6_cork->opt->hopopt)
1210                         return -ENOBUFS;
1211
1212                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1213                                                     sk->sk_allocation);
1214                 if (opt->srcrt && !v6_cork->opt->srcrt)
1215                         return -ENOBUFS;
1216
1217                 /* need source address above miyazawa*/
1218         }
1219         dst_hold(&rt->dst);
1220         cork->base.dst = &rt->dst;
1221         cork->fl.u.ip6 = *fl6;
1222         v6_cork->hop_limit = hlimit;
1223         v6_cork->tclass = tclass;
1224         if (rt->dst.flags & DST_XFRM_TUNNEL)
1225                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1226                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1227         else
1228                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1229                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1230         if (np->frag_size < mtu) {
1231                 if (np->frag_size)
1232                         mtu = np->frag_size;
1233         }
1234         cork->base.fragsize = mtu;
1235         if (dst_allfrag(rt->dst.path))
1236                 cork->base.flags |= IPCORK_ALLFRAG;
1237         cork->base.length = 0;
1238
1239         return 0;
1240 }
1241
1242 static int __ip6_append_data(struct sock *sk,
1243                              struct flowi6 *fl6,
1244                              struct sk_buff_head *queue,
1245                              struct inet_cork *cork,
1246                              struct inet6_cork *v6_cork,
1247                              struct page_frag *pfrag,
1248                              int getfrag(void *from, char *to, int offset,
1249                                          int len, int odd, struct sk_buff *skb),
1250                              void *from, int length, int transhdrlen,
1251                              unsigned int flags, int dontfrag)
1252 {
1253         struct sk_buff *skb, *skb_prev = NULL;
1254         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1255         int exthdrlen = 0;
1256         int dst_exthdrlen = 0;
1257         int hh_len;
1258         int copy;
1259         int err;
1260         int offset = 0;
1261         __u8 tx_flags = 0;
1262         u32 tskey = 0;
1263         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1264         struct ipv6_txoptions *opt = v6_cork->opt;
1265         int csummode = CHECKSUM_NONE;
1266
1267         skb = skb_peek_tail(queue);
1268         if (!skb) {
1269                 exthdrlen = opt ? opt->opt_flen : 0;
1270                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1271         }
1272
1273         mtu = cork->fragsize;
1274         orig_mtu = mtu;
1275
1276         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1277
1278         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1279                         (opt ? opt->opt_nflen : 0);
1280         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1281                      sizeof(struct frag_hdr);
1282
1283         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1284                 unsigned int maxnonfragsize, headersize;
1285
1286                 headersize = sizeof(struct ipv6hdr) +
1287                              (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1288                              (dst_allfrag(&rt->dst) ?
1289                               sizeof(struct frag_hdr) : 0) +
1290                              rt->rt6i_nfheader_len;
1291
1292                 if (ip6_sk_ignore_df(sk))
1293                         maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1294                 else
1295                         maxnonfragsize = mtu;
1296
1297                 /* dontfrag active */
1298                 if ((cork->length + length > mtu - headersize) && dontfrag &&
1299                     (sk->sk_protocol == IPPROTO_UDP ||
1300                      sk->sk_protocol == IPPROTO_RAW)) {
1301                         ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1302                                                    sizeof(struct ipv6hdr));
1303                         goto emsgsize;
1304                 }
1305
1306                 if (cork->length + length > maxnonfragsize - headersize) {
1307 emsgsize:
1308                         ipv6_local_error(sk, EMSGSIZE, fl6,
1309                                          mtu - headersize +
1310                                          sizeof(struct ipv6hdr));
1311                         return -EMSGSIZE;
1312                 }
1313         }
1314
1315         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1316                 sock_tx_timestamp(sk, &tx_flags);
1317                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1318                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1319                         tskey = sk->sk_tskey++;
1320         }
1321
1322         /* If this is the first and only packet and device
1323          * supports checksum offloading, let's use it.
1324          * Use transhdrlen, same as IPv4, because partial
1325          * sums only work when transhdrlen is set.
1326          */
1327         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1328             length + fragheaderlen < mtu &&
1329             rt->dst.dev->features & NETIF_F_V6_CSUM &&
1330             !exthdrlen)
1331                 csummode = CHECKSUM_PARTIAL;
1332         /*
1333          * Let's try using as much space as possible.
1334          * Use MTU if total length of the message fits into the MTU.
1335          * Otherwise, we need to reserve fragment header and
1336          * fragment alignment (= 8-15 octects, in total).
1337          *
1338          * Note that we may need to "move" the data from the tail of
1339          * of the buffer to the new fragment when we split
1340          * the message.
1341          *
1342          * FIXME: It may be fragmented into multiple chunks
1343          *        at once if non-fragmentable extension headers
1344          *        are too large.
1345          * --yoshfuji
1346          */
1347
1348         cork->length += length;
1349         if (((length > mtu) ||
1350              (skb && skb_is_gso(skb))) &&
1351             (sk->sk_protocol == IPPROTO_UDP) &&
1352             (rt->dst.dev->features & NETIF_F_UFO) &&
1353             (sk->sk_type == SOCK_DGRAM)) {
1354                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1355                                           hh_len, fragheaderlen,
1356                                           transhdrlen, mtu, flags, fl6);
1357                 if (err)
1358                         goto error;
1359                 return 0;
1360         }
1361
1362         if (!skb)
1363                 goto alloc_new_skb;
1364
1365         while (length > 0) {
1366                 /* Check if the remaining data fits into current packet. */
1367                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1368                 if (copy < length)
1369                         copy = maxfraglen - skb->len;
1370
1371                 if (copy <= 0) {
1372                         char *data;
1373                         unsigned int datalen;
1374                         unsigned int fraglen;
1375                         unsigned int fraggap;
1376                         unsigned int alloclen;
1377 alloc_new_skb:
1378                         /* There's no room in the current skb */
1379                         if (skb)
1380                                 fraggap = skb->len - maxfraglen;
1381                         else
1382                                 fraggap = 0;
1383                         /* update mtu and maxfraglen if necessary */
1384                         if (!skb || !skb_prev)
1385                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1386                                                     fragheaderlen, skb, rt,
1387                                                     orig_mtu);
1388
1389                         skb_prev = skb;
1390
1391                         /*
1392                          * If remaining data exceeds the mtu,
1393                          * we know we need more fragment(s).
1394                          */
1395                         datalen = length + fraggap;
1396
1397                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1398                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1399                         if ((flags & MSG_MORE) &&
1400                             !(rt->dst.dev->features&NETIF_F_SG))
1401                                 alloclen = mtu;
1402                         else
1403                                 alloclen = datalen + fragheaderlen;
1404
1405                         alloclen += dst_exthdrlen;
1406
1407                         if (datalen != length + fraggap) {
1408                                 /*
1409                                  * this is not the last fragment, the trailer
1410                                  * space is regarded as data space.
1411                                  */
1412                                 datalen += rt->dst.trailer_len;
1413                         }
1414
1415                         alloclen += rt->dst.trailer_len;
1416                         fraglen = datalen + fragheaderlen;
1417
1418                         /*
1419                          * We just reserve space for fragment header.
1420                          * Note: this may be overallocation if the message
1421                          * (without MSG_MORE) fits into the MTU.
1422                          */
1423                         alloclen += sizeof(struct frag_hdr);
1424
1425                         if (transhdrlen) {
1426                                 skb = sock_alloc_send_skb(sk,
1427                                                 alloclen + hh_len,
1428                                                 (flags & MSG_DONTWAIT), &err);
1429                         } else {
1430                                 skb = NULL;
1431                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1432                                     2 * sk->sk_sndbuf)
1433                                         skb = sock_wmalloc(sk,
1434                                                            alloclen + hh_len, 1,
1435                                                            sk->sk_allocation);
1436                                 if (unlikely(!skb))
1437                                         err = -ENOBUFS;
1438                         }
1439                         if (!skb)
1440                                 goto error;
1441                         /*
1442                          *      Fill in the control structures
1443                          */
1444                         skb->protocol = htons(ETH_P_IPV6);
1445                         skb->ip_summed = csummode;
1446                         skb->csum = 0;
1447                         /* reserve for fragmentation and ipsec header */
1448                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1449                                     dst_exthdrlen);
1450
1451                         /* Only the initial fragment is time stamped */
1452                         skb_shinfo(skb)->tx_flags = tx_flags;
1453                         tx_flags = 0;
1454                         skb_shinfo(skb)->tskey = tskey;
1455                         tskey = 0;
1456
1457                         /*
1458                          *      Find where to start putting bytes
1459                          */
1460                         data = skb_put(skb, fraglen);
1461                         skb_set_network_header(skb, exthdrlen);
1462                         data += fragheaderlen;
1463                         skb->transport_header = (skb->network_header +
1464                                                  fragheaderlen);
1465                         if (fraggap) {
1466                                 skb->csum = skb_copy_and_csum_bits(
1467                                         skb_prev, maxfraglen,
1468                                         data + transhdrlen, fraggap, 0);
1469                                 skb_prev->csum = csum_sub(skb_prev->csum,
1470                                                           skb->csum);
1471                                 data += fraggap;
1472                                 pskb_trim_unique(skb_prev, maxfraglen);
1473                         }
1474                         copy = datalen - transhdrlen - fraggap;
1475
1476                         if (copy < 0) {
1477                                 err = -EINVAL;
1478                                 kfree_skb(skb);
1479                                 goto error;
1480                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1481                                 err = -EFAULT;
1482                                 kfree_skb(skb);
1483                                 goto error;
1484                         }
1485
1486                         offset += copy;
1487                         length -= datalen - fraggap;
1488                         transhdrlen = 0;
1489                         exthdrlen = 0;
1490                         dst_exthdrlen = 0;
1491
1492                         /*
1493                          * Put the packet on the pending queue
1494                          */
1495                         __skb_queue_tail(queue, skb);
1496                         continue;
1497                 }
1498
1499                 if (copy > length)
1500                         copy = length;
1501
1502                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1503                         unsigned int off;
1504
1505                         off = skb->len;
1506                         if (getfrag(from, skb_put(skb, copy),
1507                                                 offset, copy, off, skb) < 0) {
1508                                 __skb_trim(skb, off);
1509                                 err = -EFAULT;
1510                                 goto error;
1511                         }
1512                 } else {
1513                         int i = skb_shinfo(skb)->nr_frags;
1514
1515                         err = -ENOMEM;
1516                         if (!sk_page_frag_refill(sk, pfrag))
1517                                 goto error;
1518
1519                         if (!skb_can_coalesce(skb, i, pfrag->page,
1520                                               pfrag->offset)) {
1521                                 err = -EMSGSIZE;
1522                                 if (i == MAX_SKB_FRAGS)
1523                                         goto error;
1524
1525                                 __skb_fill_page_desc(skb, i, pfrag->page,
1526                                                      pfrag->offset, 0);
1527                                 skb_shinfo(skb)->nr_frags = ++i;
1528                                 get_page(pfrag->page);
1529                         }
1530                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1531                         if (getfrag(from,
1532                                     page_address(pfrag->page) + pfrag->offset,
1533                                     offset, copy, skb->len, skb) < 0)
1534                                 goto error_efault;
1535
1536                         pfrag->offset += copy;
1537                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1538                         skb->len += copy;
1539                         skb->data_len += copy;
1540                         skb->truesize += copy;
1541                         atomic_add(copy, &sk->sk_wmem_alloc);
1542                 }
1543                 offset += copy;
1544                 length -= copy;
1545         }
1546
1547         return 0;
1548
1549 error_efault:
1550         err = -EFAULT;
1551 error:
1552         cork->length -= length;
1553         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1554         return err;
1555 }
1556
1557 int ip6_append_data(struct sock *sk,
1558                     int getfrag(void *from, char *to, int offset, int len,
1559                                 int odd, struct sk_buff *skb),
1560                     void *from, int length, int transhdrlen, int hlimit,
1561                     int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1562                     struct rt6_info *rt, unsigned int flags, int dontfrag)
1563 {
1564         struct inet_sock *inet = inet_sk(sk);
1565         struct ipv6_pinfo *np = inet6_sk(sk);
1566         int exthdrlen;
1567         int err;
1568
1569         if (flags&MSG_PROBE)
1570                 return 0;
1571         if (skb_queue_empty(&sk->sk_write_queue)) {
1572                 /*
1573                  * setup for corking
1574                  */
1575                 err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1576                                      tclass, opt, rt, fl6);
1577                 if (err)
1578                         return err;
1579
1580                 exthdrlen = (opt ? opt->opt_flen : 0);
1581                 length += exthdrlen;
1582                 transhdrlen += exthdrlen;
1583         } else {
1584                 fl6 = &inet->cork.fl.u.ip6;
1585                 transhdrlen = 0;
1586         }
1587
1588         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1589                                  &np->cork, sk_page_frag(sk), getfrag,
1590                                  from, length, transhdrlen, flags, dontfrag);
1591 }
1592 EXPORT_SYMBOL_GPL(ip6_append_data);
1593
1594 static void ip6_cork_release(struct inet_cork_full *cork,
1595                              struct inet6_cork *v6_cork)
1596 {
1597         if (v6_cork->opt) {
1598                 kfree(v6_cork->opt->dst0opt);
1599                 kfree(v6_cork->opt->dst1opt);
1600                 kfree(v6_cork->opt->hopopt);
1601                 kfree(v6_cork->opt->srcrt);
1602                 kfree(v6_cork->opt);
1603                 v6_cork->opt = NULL;
1604         }
1605
1606         if (cork->base.dst) {
1607                 dst_release(cork->base.dst);
1608                 cork->base.dst = NULL;
1609                 cork->base.flags &= ~IPCORK_ALLFRAG;
1610         }
1611         memset(&cork->fl, 0, sizeof(cork->fl));
1612 }
1613
1614 struct sk_buff *__ip6_make_skb(struct sock *sk,
1615                                struct sk_buff_head *queue,
1616                                struct inet_cork_full *cork,
1617                                struct inet6_cork *v6_cork)
1618 {
1619         struct sk_buff *skb, *tmp_skb;
1620         struct sk_buff **tail_skb;
1621         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1622         struct ipv6_pinfo *np = inet6_sk(sk);
1623         struct net *net = sock_net(sk);
1624         struct ipv6hdr *hdr;
1625         struct ipv6_txoptions *opt = v6_cork->opt;
1626         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1627         struct flowi6 *fl6 = &cork->fl.u.ip6;
1628         unsigned char proto = fl6->flowi6_proto;
1629
1630         skb = __skb_dequeue(queue);
1631         if (!skb)
1632                 goto out;
1633         tail_skb = &(skb_shinfo(skb)->frag_list);
1634
1635         /* move skb->data to ip header from ext header */
1636         if (skb->data < skb_network_header(skb))
1637                 __skb_pull(skb, skb_network_offset(skb));
1638         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1639                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1640                 *tail_skb = tmp_skb;
1641                 tail_skb = &(tmp_skb->next);
1642                 skb->len += tmp_skb->len;
1643                 skb->data_len += tmp_skb->len;
1644                 skb->truesize += tmp_skb->truesize;
1645                 tmp_skb->destructor = NULL;
1646                 tmp_skb->sk = NULL;
1647         }
1648
1649         /* Allow local fragmentation. */
1650         skb->ignore_df = ip6_sk_ignore_df(sk);
1651
1652         *final_dst = fl6->daddr;
1653         __skb_pull(skb, skb_network_header_len(skb));
1654         if (opt && opt->opt_flen)
1655                 ipv6_push_frag_opts(skb, opt, &proto);
1656         if (opt && opt->opt_nflen)
1657                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1658
1659         skb_push(skb, sizeof(struct ipv6hdr));
1660         skb_reset_network_header(skb);
1661         hdr = ipv6_hdr(skb);
1662
1663         ip6_flow_hdr(hdr, v6_cork->tclass,
1664                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1665                                         np->autoflowlabel, fl6));
1666         hdr->hop_limit = v6_cork->hop_limit;
1667         hdr->nexthdr = proto;
1668         hdr->saddr = fl6->saddr;
1669         hdr->daddr = *final_dst;
1670
1671         skb->priority = sk->sk_priority;
1672         skb->mark = sk->sk_mark;
1673
1674         skb_dst_set(skb, dst_clone(&rt->dst));
1675         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1676         if (proto == IPPROTO_ICMPV6) {
1677                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1678
1679                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1680                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1681         }
1682
1683         ip6_cork_release(cork, v6_cork);
1684 out:
1685         return skb;
1686 }
1687
1688 int ip6_send_skb(struct sk_buff *skb)
1689 {
1690         struct net *net = sock_net(skb->sk);
1691         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1692         int err;
1693
1694         err = ip6_local_out(net, skb->sk, skb);
1695         if (err) {
1696                 if (err > 0)
1697                         err = net_xmit_errno(err);
1698                 if (err)
1699                         IP6_INC_STATS(net, rt->rt6i_idev,
1700                                       IPSTATS_MIB_OUTDISCARDS);
1701         }
1702
1703         return err;
1704 }
1705
/*
 *      ip6_push_pending_frames - build and transmit the socket's pending data
 *
 *      Finishes the pending data with ip6_finish_skb() and sends the
 *      result through ip6_send_skb().
 *
 *      Returns 0 if there was nothing to send, otherwise the result of
 *      ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb = ip6_finish_skb(sk);

        return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1717
1718 static void __ip6_flush_pending_frames(struct sock *sk,
1719                                        struct sk_buff_head *queue,
1720                                        struct inet_cork_full *cork,
1721                                        struct inet6_cork *v6_cork)
1722 {
1723         struct sk_buff *skb;
1724
1725         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1726                 if (skb_dst(skb))
1727                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1728                                       IPSTATS_MIB_OUTDISCARDS);
1729                 kfree_skb(skb);
1730         }
1731
1732         ip6_cork_release(cork, v6_cork);
1733 }
1734
1735 void ip6_flush_pending_frames(struct sock *sk)
1736 {
1737         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1738                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1739 }
1740 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1741
1742 struct sk_buff *ip6_make_skb(struct sock *sk,
1743                              int getfrag(void *from, char *to, int offset,
1744                                          int len, int odd, struct sk_buff *skb),
1745                              void *from, int length, int transhdrlen,
1746                              int hlimit, int tclass,
1747                              struct ipv6_txoptions *opt, struct flowi6 *fl6,
1748                              struct rt6_info *rt, unsigned int flags,
1749                              int dontfrag)
1750 {
1751         struct inet_cork_full cork;
1752         struct inet6_cork v6_cork;
1753         struct sk_buff_head queue;
1754         int exthdrlen = (opt ? opt->opt_flen : 0);
1755         int err;
1756
1757         if (flags & MSG_PROBE)
1758                 return NULL;
1759
1760         __skb_queue_head_init(&queue);
1761
1762         cork.base.flags = 0;
1763         cork.base.addr = 0;
1764         cork.base.opt = NULL;
1765         v6_cork.opt = NULL;
1766         err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1767         if (err)
1768                 return ERR_PTR(err);
1769
1770         if (dontfrag < 0)
1771                 dontfrag = inet6_sk(sk)->dontfrag;
1772
1773         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1774                                 &current->task_frag, getfrag, from,
1775                                 length + exthdrlen, transhdrlen + exthdrlen,
1776                                 flags, dontfrag);
1777         if (err) {
1778                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1779                 return ERR_PTR(err);
1780         }
1781
1782         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1783 }