]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/ipv6/ip6_output.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[karo-tx-linux.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int __ip6_local_out(struct sk_buff *skb)
60 {
61         int len;
62
63         len = skb->len - sizeof(struct ipv6hdr);
64         if (len > IPV6_MAXPLEN)
65                 len = 0;
66         ipv6_hdr(skb)->payload_len = htons(len);
67
68         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
69                        skb_dst(skb)->dev, dst_output);
70 }
71
/* Emit a locally generated IPv6 packet: run the LOCAL_OUT hook and, when
 * netfilter lets the packet through (return value 1), push it out via
 * dst_output().  Any other hook verdict is propagated unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int rc = __ip6_local_out(skb);

	return likely(rc == 1) ? dst_output(skb) : rc;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
83
/* Final transmission step for an IPv6 packet: loop multicast traffic back
 * to local listeners when required, update multicast statistics, then
 * resolve the next-hop neighbour and hand the skb to the neighbour output
 * path.  Returns the neighbour output result, or -EINVAL (skb freed) when
 * no neighbour entry could be looked up or created.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a clone back to the host when the sender requested
		 * multicast loopback and either a multicast-router socket
		 * is interested (and the packet was not already forwarded)
		 * or this device is a member of the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the packet must not leave the host;
			 * the looped-back clone above is all that is sent.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* Neighbour lookup is under rcu_read_lock_bh(); the entry found by
	 * __ipv6_neigh_lookup_noref() holds no reference, so the output
	 * call must happen before the unlock.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
142
143 static int ip6_finish_output(struct sk_buff *skb)
144 {
145         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
146             dst_allfrag(skb_dst(skb)))
147                 return ip6_fragment(skb, ip6_finish_output2);
148         else
149                 return ip6_finish_output2(skb);
150 }
151
152 int ip6_output(struct sk_buff *skb)
153 {
154         struct net_device *dev = skb_dst(skb)->dev;
155         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
156         if (unlikely(idev->cnf.disable_ipv6)) {
157                 IP6_INC_STATS(dev_net(dev), idev,
158                               IPSTATS_MIB_OUTDISCARDS);
159                 kfree_skb(skb);
160                 return 0;
161         }
162
163         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
164                             ip6_finish_output,
165                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
166 }
167
168 /*
169  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
170  */
171
/* xmit an sk_buff (used by TCP, SCTP and DCCP): prepend extension headers
 * and the IPv6 header described by @fl6/@opt/@tclass, then hand the
 * packet to the LOCAL_OUT hook.  Returns the hook/output result, -ENOBUFS
 * if headroom reallocation fails, or -EMSGSIZE if the packet exceeds the
 * path MTU and may not be sent (ICMPV6_PKT_TOOBIG is raised locally).
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* The reallocated skb must be charged to the socket
			 * so write-space accounting stays correct.
			 */
			skb_set_owner_w(skb, sk);
		}
		/* Fragmentable options first, then non-fragmentable ones;
		 * ipv6_push_nfrag_opts() may rewrite first_hop (routing hdr).
		 */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	/* Socket did not pin a hop limit: fall back to the route's default. */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	/* Send if it fits, if the socket disabled the DF semantics
	 * (local_df), or if GSO will segment it later.
	 */
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big: tell our own stack (the sending socket) via ICMPv6. */
	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
254
/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain whose selector matches @sel (and whose bound device, if
 * any, matches the ingress device).  All but the last matching socket get
 * a clone; the last one consumes the original skb.  Returns 1 when the
 * skb was consumed by a listener, 0 when no socket matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* A previous match exists: it receives a
				 * clone so the original can go to the next.
				 */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
283
/* Decide how a packet destined to a proxied (pneigh) address is handled:
 * return 1 to pass it to local input (NDISC messages), 0 to forward it,
 * or -1 to discard it (link-local destination; dst_link_failure() already
 * signalled the sender).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Skip any extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is in the
		 * linear area before dereferencing it.
		 */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
335
/* NF_INET_FORWARD okfn: push the forwarded packet out via its route. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
340
/* Forward an IPv6 packet received on one interface out another: enforce
 * forwarding policy (sysctl, XFRM, hop limit, MTU, source-address sanity),
 * deliver Router Alert packets to listeners, honour NDP proxying, emit
 * redirects when appropriate, decrement the hop limit and run the FORWARD
 * netfilter hook.  Returns 0 on success (or local delivery), a negative
 * errno when the packet is dropped.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	/* Forwarding disabled by sysctl: count as an address error. */
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged skbs must never be forwarded (checksums/geometry). */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Only forward packets actually addressed to us at L2. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have changed the route: reload. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	/* Routers never fragment on behalf of the sender in IPv6; packets
	 * that do not fit the egress MTU get ICMPV6_PKT_TOOBIG instead.
	 */
	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
499
/* Copy per-packet metadata (type, priority, protocol, route, device,
 * mark, tc index, netfilter and security state) from @from onto the
 * freshly allocated fragment @to so the fragment is routed and accounted
 * exactly like the original packet.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any dst already on 'to' with a new ref to 'from's dst. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
519
/* Fragment an IPv6 packet to fit the path MTU and emit each fragment via
 * @output.  Two strategies: a fast path that reuses an existing, suitably
 * shaped frag_list (each frag becomes one fragment in place), and a slow
 * path that allocates and copies a fresh skb per fragment.  Returns 0 on
 * success, -EMSGSIZE when fragmentation is forbidden, or the error from
 * allocation/@output.  The original skb is always consumed.
 */
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err=0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* hlen = length of the unfragmentable part; prevhdr points at the
	 * nexthdr byte that must be rewritten to NEXTHDR_FRAGMENT.
	 */
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Honour a smaller per-socket fragment size if one is set. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* mtu is now the payload budget per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		/* Fast path requires: first chunk fits, all middle chunks
		 * are multiples of 8 bytes, nothing is cloned/shared, and
		 * every frag has headroom for the headers we will push.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		/* Keep a copy of the unfragmentable part to replicate into
		 * every fragment.
		 */
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		/* Insert the fragment header between the unfragmentable
		 * part and the payload of the first fragment.
		 */
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if(!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		/* Output failed mid-stream: free the fragments not yet sent. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		/* Undo the ownership transfer done for the frags we already
		 * walked before falling back to the slow path.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while(left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		/* Pick one Identification value on the first fragment and
		 * reuse it for all subsequent fragments of this packet.
		 */
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
805
806 static inline int ip6_rt_check(const struct rt6key *rt_key,
807                                const struct in6_addr *fl_addr,
808                                const struct in6_addr *addr_cache)
809 {
810         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
811                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
812 }
813
814 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
815                                           struct dst_entry *dst,
816                                           const struct flowi6 *fl6)
817 {
818         struct ipv6_pinfo *np = inet6_sk(sk);
819         struct rt6_info *rt = (struct rt6_info *)dst;
820
821         if (!dst)
822                 goto out;
823
824         /* Yes, checking route validity in not connected
825          * case is not very simple. Take into account,
826          * that we do not support routing by source, TOS,
827          * and MSG_DONTROUTE            --ANK (980726)
828          *
829          * 1. ip6_rt_check(): If route was host route,
830          *    check that cached destination is current.
831          *    If it is network route, we still may
832          *    check its validity using saved pointer
833          *    to the last used address: daddr_cache.
834          *    We do not want to save whole address now,
835          *    (because main consumer of this service
836          *    is tcp, which has not this problem),
837          *    so that the last trick works only on connected
838          *    sockets.
839          * 2. oif also should be the same.
840          */
841         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
842 #ifdef CONFIG_IPV6_SUBTREES
843             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
844 #endif
845             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
846                 dst_release(dst);
847                 dst = NULL;
848         }
849
850 out:
851         return dst;
852 }
853
/* Core of the dst lookup: resolve *dst for @fl6 (performing a route
 * lookup when the caller did not supply one), pick a source address if
 * the flow has none, and — on CONFIG_IPV6_OPTIMISTIC_DAD kernels —
 * possibly replace the dst with the default router's when the chosen
 * source address is still in optimistic DAD state.
 *
 * Returns 0 on success or a negative errno; on error *dst has been
 * released and set to NULL.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	/* ip6_route_output() never returns NULL; failure is signalled
	 * via the dst's error field. */
	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		/* No source address given: derive one from the route. */
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
	/* err != 0 means "neighbour exists but is not yet VALID". */
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		/* Only redirect when the chosen source address is marked
		 * optimistic on the outgoing device. */
		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* An unspecified daddr makes the lookup resolve to
			 * the default route. */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
930
931 /**
932  *      ip6_dst_lookup - perform route lookup on flow
933  *      @sk: socket which provides route info
934  *      @dst: pointer to dst_entry * for result
935  *      @fl6: flow to lookup
936  *
937  *      This function performs a route lookup on the given flow.
938  *
939  *      It returns zero on success, or a standard errno code on error.
940  */
941 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
942 {
943         *dst = NULL;
944         return ip6_dst_lookup_tail(sk, dst, fl6);
945 }
946 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
947
948 /**
949  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
950  *      @sk: socket which provides route info
951  *      @fl6: flow to lookup
952  *      @final_dst: final destination address for ipsec lookup
953  *      @can_sleep: we are in a sleepable context
954  *
955  *      This function performs a route lookup on the given flow.
956  *
957  *      It returns a valid dst pointer on success, or a pointer encoded
958  *      error code.
959  */
960 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
961                                       const struct in6_addr *final_dst,
962                                       bool can_sleep)
963 {
964         struct dst_entry *dst = NULL;
965         int err;
966
967         err = ip6_dst_lookup_tail(sk, &dst, fl6);
968         if (err)
969                 return ERR_PTR(err);
970         if (final_dst)
971                 fl6->daddr = *final_dst;
972         if (can_sleep)
973                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
974
975         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
976 }
977 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
978
979 /**
980  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
981  *      @sk: socket which provides the dst cache and route info
982  *      @fl6: flow to lookup
983  *      @final_dst: final destination address for ipsec lookup
984  *      @can_sleep: we are in a sleepable context
985  *
986  *      This function performs a route lookup on the given flow with the
987  *      possibility of using the cached route in the socket if it is valid.
988  *      It will take the socket dst lock when operating on the dst cache.
989  *      As a result, this function can only be used in process context.
990  *
991  *      It returns a valid dst pointer on success, or a pointer encoded
992  *      error code.
993  */
994 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
995                                          const struct in6_addr *final_dst,
996                                          bool can_sleep)
997 {
998         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
999         int err;
1000
1001         dst = ip6_sk_dst_check(sk, dst, fl6);
1002
1003         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1004         if (err)
1005                 return ERR_PTR(err);
1006         if (final_dst)
1007                 fl6->daddr = *final_dst;
1008         if (can_sleep)
1009                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1010
1011         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1012 }
1013 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1014
1015 static inline int ip6_ufo_append_data(struct sock *sk,
1016                         int getfrag(void *from, char *to, int offset, int len,
1017                         int odd, struct sk_buff *skb),
1018                         void *from, int length, int hh_len, int fragheaderlen,
1019                         int transhdrlen, int mtu,unsigned int flags,
1020                         struct rt6_info *rt)
1021
1022 {
1023         struct sk_buff *skb;
1024         int err;
1025
1026         /* There is support for UDP large send offload by network
1027          * device, so create one single skb packet containing complete
1028          * udp datagram
1029          */
1030         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1031                 skb = sock_alloc_send_skb(sk,
1032                         hh_len + fragheaderlen + transhdrlen + 20,
1033                         (flags & MSG_DONTWAIT), &err);
1034                 if (skb == NULL)
1035                         return err;
1036
1037                 /* reserve space for Hardware header */
1038                 skb_reserve(skb, hh_len);
1039
1040                 /* create space for UDP/IP header */
1041                 skb_put(skb,fragheaderlen + transhdrlen);
1042
1043                 /* initialize network header pointer */
1044                 skb_reset_network_header(skb);
1045
1046                 /* initialize protocol header pointer */
1047                 skb->transport_header = skb->network_header + fragheaderlen;
1048
1049                 skb->ip_summed = CHECKSUM_PARTIAL;
1050                 skb->csum = 0;
1051         }
1052
1053         err = skb_append_datato_frags(sk,skb, getfrag, from,
1054                                       (length - transhdrlen));
1055         if (!err) {
1056                 struct frag_hdr fhdr;
1057
1058                 /* Specify the length of each IPv6 datagram fragment.
1059                  * It has to be a multiple of 8.
1060                  */
1061                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1062                                              sizeof(struct frag_hdr)) & ~7;
1063                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1064                 ipv6_select_ident(&fhdr, rt);
1065                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1066                 __skb_queue_tail(&sk->sk_write_queue, skb);
1067
1068                 return 0;
1069         }
1070         /* There is not enough support do UPD LSO,
1071          * so follow normal path
1072          */
1073         kfree_skb(skb);
1074
1075         return err;
1076 }
1077
1078 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1079                                                gfp_t gfp)
1080 {
1081         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1082 }
1083
1084 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1085                                                 gfp_t gfp)
1086 {
1087         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1088 }
1089
1090 static void ip6_append_data_mtu(int *mtu,
1091                                 int *maxfraglen,
1092                                 unsigned int fragheaderlen,
1093                                 struct sk_buff *skb,
1094                                 struct rt6_info *rt)
1095 {
1096         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1097                 if (skb == NULL) {
1098                         /* first fragment, reserve header_len */
1099                         *mtu = *mtu - rt->dst.header_len;
1100
1101                 } else {
1102                         /*
1103                          * this fragment is not first, the headers
1104                          * space is regarded as data space.
1105                          */
1106                         *mtu = dst_mtu(rt->dst.path);
1107                 }
1108                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1109                               + fragheaderlen - sizeof(struct frag_hdr);
1110         }
1111 }
1112
/*
 *	ip6_append_data - append data to the socket's corked write queue,
 *	to be transmitted later by ip6_push_pending_frames().
 *
 *	On the first call (empty write queue) the cork is set up: @opt is
 *	deep-copied, the route @rt is held and the effective mtu computed.
 *	Subsequent calls reuse the corked route/flow/options and ignore
 *	the corresponding arguments.  Data pulled via @getfrag is packed
 *	into mtu-sized skbs (or one large skb when UFO is available).
 *
 *	Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* Deep-copy each extension header so the cork
			 * outlives the caller's ipv6_txoptions.
			 */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* For an XFRM tunnel route the dst itself carries the right
		 * mtu; otherwise use the path dst so transform overhead is
		 * accounted for.  PMTUDISC_PROBE uses the raw device mtu.
		 */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		/* The first call also carries the extension headers'
		 * length; fold it into length/transhdrlen once.
		 */
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Already corked: reuse the saved route, flow and options. */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	/* Refuse datagrams that cannot fit in a 64KiB IPv6 payload. */
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		/* IPV6_DONTFRAG: report the path mtu instead of
		 * fragmenting datagram sockets' output.
		 */
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		/* UDP fragmentation offload: hand the whole datagram to
		 * the device as one skb.
		 */
		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First fragment: may block per MSG_DONTWAIT. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the tail overlap of the previous skb
				 * into this fragment and fix up the previous
				 * skb's checksum accordingly.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Headers are only present in the first fragment. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter/gather: copy straight into the skb's
			 * linear area.
			 */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter/gather: copy into the socket's page frag
			 * and attach it (or coalesce) as an skb fragment.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	/* Undo the optimistic cork->length accounting for data that
	 * was never queued.
	 */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1460
1461 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1462 {
1463         if (np->cork.opt) {
1464                 kfree(np->cork.opt->dst0opt);
1465                 kfree(np->cork.opt->dst1opt);
1466                 kfree(np->cork.opt->hopopt);
1467                 kfree(np->cork.opt->srcrt);
1468                 kfree(np->cork.opt);
1469                 np->cork.opt = NULL;
1470         }
1471
1472         if (inet->cork.base.dst) {
1473                 dst_release(inet->cork.base.dst);
1474                 inet->cork.base.dst = NULL;
1475                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1476         }
1477         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1478 }
1479
/* Coalesce every skb on the socket's write queue into a single packet
 * (chained via frag_list), prepend the corked extension headers and the
 * IPv6 header, and hand the result to ip6_local_out().  The cork is
 * always released.  Returns 0 or a negative xmit errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the first one's frag_list,
	 * transferring their accounting to the head skb.
	 */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* A routing header rewrites the final destination via
	 * ipv6_push_nfrag_opts(), so keep a private copy for the
	 * IPv6 header's daddr field.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive return values are congestion codes; map them
		 * to an errno (or 0) before reporting.
		 */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1562
1563 void ip6_flush_pending_frames(struct sock *sk)
1564 {
1565         struct sk_buff *skb;
1566
1567         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1568                 if (skb_dst(skb))
1569                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1570                                       IPSTATS_MIB_OUTDISCARDS);
1571                 kfree_skb(skb);
1572         }
1573
1574         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1575 }
1576 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);