]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/ipv6/ip6_output.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[karo-tx-linux.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62         struct dst_entry *dst = skb_dst(skb);
63         struct net_device *dev = dst->dev;
64         struct neighbour *neigh;
65         struct in6_addr *nexthop;
66         int ret;
67
68         skb->protocol = htons(ETH_P_IPV6);
69         skb->dev = dev;
70
71         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
72                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
73
74                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
75                     ((mroute6_socket(net, skb) &&
76                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
77                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
78                                          &ipv6_hdr(skb)->saddr))) {
79                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
80
81                         /* Do not check for IFF_ALLMULTI; multicast routing
82                            is not supported in any case.
83                          */
84                         if (newskb)
85                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
86                                         net, sk, newskb, NULL, newskb->dev,
87                                         dev_loopback_xmit);
88
89                         if (ipv6_hdr(skb)->hop_limit == 0) {
90                                 IP6_INC_STATS(net, idev,
91                                               IPSTATS_MIB_OUTDISCARDS);
92                                 kfree_skb(skb);
93                                 return 0;
94                         }
95                 }
96
97                 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
98
99                 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100                     IPV6_ADDR_SCOPE_NODELOCAL &&
101                     !(dev->flags & IFF_LOOPBACK)) {
102                         kfree_skb(skb);
103                         return 0;
104                 }
105         }
106
107         rcu_read_lock_bh();
108         nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109         neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110         if (unlikely(!neigh))
111                 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112         if (!IS_ERR(neigh)) {
113                 ret = dst_neigh_output(dst, neigh, skb);
114                 rcu_read_unlock_bh();
115                 return ret;
116         }
117         rcu_read_unlock_bh();
118
119         IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
120         kfree_skb(skb);
121         return -EINVAL;
122 }
123
124 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
125 {
126         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
127             dst_allfrag(skb_dst(skb)) ||
128             (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
129                 return ip6_fragment(net, sk, skb, ip6_finish_output2);
130         else
131                 return ip6_finish_output2(net, sk, skb);
132 }
133
134 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
135 {
136         struct net_device *dev = skb_dst(skb)->dev;
137         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
138
139         if (unlikely(idev->cnf.disable_ipv6)) {
140                 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
141                 kfree_skb(skb);
142                 return 0;
143         }
144
145         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
146                             net, sk, skb, NULL, dev,
147                             ip6_finish_output,
148                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
149 }
150
151 /*
152  * xmit an sk_buff (used by TCP, SCTP and DCCP)
153  * Note : socket lock is not held for SYNACK packets, but might be modified
154  * by calls to skb_set_owner_w() and ipv6_local_error(),
155  * which are using proper atomic operations or spinlocks.
156  */
157 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
158              struct ipv6_txoptions *opt, int tclass)
159 {
160         struct net *net = sock_net(sk);
161         const struct ipv6_pinfo *np = inet6_sk(sk);
162         struct in6_addr *first_hop = &fl6->daddr;
163         struct dst_entry *dst = skb_dst(skb);
164         struct ipv6hdr *hdr;
165         u8  proto = fl6->flowi6_proto;
166         int seg_len = skb->len;
167         int hlimit = -1;
168         u32 mtu;
169
170         if (opt) {
171                 unsigned int head_room;
172
173                 /* First: exthdrs may take lots of space (~8K for now)
174                    MAX_HEADER is not enough.
175                  */
176                 head_room = opt->opt_nflen + opt->opt_flen;
177                 seg_len += head_room;
178                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
179
180                 if (skb_headroom(skb) < head_room) {
181                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
182                         if (!skb2) {
183                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
184                                               IPSTATS_MIB_OUTDISCARDS);
185                                 kfree_skb(skb);
186                                 return -ENOBUFS;
187                         }
188                         consume_skb(skb);
189                         skb = skb2;
190                         /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
191                          * it is safe to call in our context (socket lock not held)
192                          */
193                         skb_set_owner_w(skb, (struct sock *)sk);
194                 }
195                 if (opt->opt_flen)
196                         ipv6_push_frag_opts(skb, opt, &proto);
197                 if (opt->opt_nflen)
198                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
199         }
200
201         skb_push(skb, sizeof(struct ipv6hdr));
202         skb_reset_network_header(skb);
203         hdr = ipv6_hdr(skb);
204
205         /*
206          *      Fill in the IPv6 header
207          */
208         if (np)
209                 hlimit = np->hop_limit;
210         if (hlimit < 0)
211                 hlimit = ip6_dst_hoplimit(dst);
212
213         ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
214                                                      np->autoflowlabel, fl6));
215
216         hdr->payload_len = htons(seg_len);
217         hdr->nexthdr = proto;
218         hdr->hop_limit = hlimit;
219
220         hdr->saddr = fl6->saddr;
221         hdr->daddr = *first_hop;
222
223         skb->protocol = htons(ETH_P_IPV6);
224         skb->priority = sk->sk_priority;
225         skb->mark = sk->sk_mark;
226
227         mtu = dst_mtu(dst);
228         if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
229                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
230                               IPSTATS_MIB_OUT, skb->len);
231                 /* hooks should never assume socket lock is held.
232                  * we promote our socket to non const
233                  */
234                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
235                                net, (struct sock *)sk, skb, NULL, dst->dev,
236                                dst_output);
237         }
238
239         skb->dev = dst->dev;
240         /* ipv6_local_error() does not require socket lock,
241          * we promote our socket to non const
242          */
243         ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
244
245         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
246         kfree_skb(skb);
247         return -EMSGSIZE;
248 }
249 EXPORT_SYMBOL(ip6_xmit);
250
251 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
252 {
253         struct ip6_ra_chain *ra;
254         struct sock *last = NULL;
255
256         read_lock(&ip6_ra_lock);
257         for (ra = ip6_ra_chain; ra; ra = ra->next) {
258                 struct sock *sk = ra->sk;
259                 if (sk && ra->sel == sel &&
260                     (!sk->sk_bound_dev_if ||
261                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
262                         if (last) {
263                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
264                                 if (skb2)
265                                         rawv6_rcv(last, skb2);
266                         }
267                         last = sk;
268                 }
269         }
270
271         if (last) {
272                 rawv6_rcv(last, skb);
273                 read_unlock(&ip6_ra_lock);
274                 return 1;
275         }
276         read_unlock(&ip6_ra_lock);
277         return 0;
278 }
279
280 static int ip6_forward_proxy_check(struct sk_buff *skb)
281 {
282         struct ipv6hdr *hdr = ipv6_hdr(skb);
283         u8 nexthdr = hdr->nexthdr;
284         __be16 frag_off;
285         int offset;
286
287         if (ipv6_ext_hdr(nexthdr)) {
288                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
289                 if (offset < 0)
290                         return 0;
291         } else
292                 offset = sizeof(struct ipv6hdr);
293
294         if (nexthdr == IPPROTO_ICMPV6) {
295                 struct icmp6hdr *icmp6;
296
297                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
298                                          offset + 1 - skb->data)))
299                         return 0;
300
301                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
302
303                 switch (icmp6->icmp6_type) {
304                 case NDISC_ROUTER_SOLICITATION:
305                 case NDISC_ROUTER_ADVERTISEMENT:
306                 case NDISC_NEIGHBOUR_SOLICITATION:
307                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
308                 case NDISC_REDIRECT:
309                         /* For reaction involving unicast neighbor discovery
310                          * message destined to the proxied address, pass it to
311                          * input function.
312                          */
313                         return 1;
314                 default:
315                         break;
316                 }
317         }
318
319         /*
320          * The proxying router can't forward traffic sent to a link-local
321          * address, so signal the sender and discard the packet. This
322          * behavior is clarified by the MIPv6 specification.
323          */
324         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
325                 dst_link_failure(skb);
326                 return -1;
327         }
328
329         return 0;
330 }
331
/*
 * Completion callback of the FORWARD hook: clear the skb's sender-CPU
 * state and pass the packet on to dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc;

	skb_sender_cpu_clear(skb);
	rc = dst_output(net, sk, skb);

	return rc;
}
338
339 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
340 {
341         unsigned int mtu;
342         struct inet6_dev *idev;
343
344         if (dst_metric_locked(dst, RTAX_MTU)) {
345                 mtu = dst_metric_raw(dst, RTAX_MTU);
346                 if (mtu)
347                         return mtu;
348         }
349
350         mtu = IPV6_MIN_MTU;
351         rcu_read_lock();
352         idev = __in6_dev_get(dst->dev);
353         if (idev)
354                 mtu = idev->cnf.mtu6;
355         rcu_read_unlock();
356
357         return mtu;
358 }
359
360 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
361 {
362         if (skb->len <= mtu)
363                 return false;
364
365         /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
366         if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
367                 return true;
368
369         if (skb->ignore_df)
370                 return false;
371
372         if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
373                 return false;
374
375         return true;
376 }
377
378 int ip6_forward(struct sk_buff *skb)
379 {
380         struct dst_entry *dst = skb_dst(skb);
381         struct ipv6hdr *hdr = ipv6_hdr(skb);
382         struct inet6_skb_parm *opt = IP6CB(skb);
383         struct net *net = dev_net(dst->dev);
384         u32 mtu;
385
386         if (net->ipv6.devconf_all->forwarding == 0)
387                 goto error;
388
389         if (skb->pkt_type != PACKET_HOST)
390                 goto drop;
391
392         if (unlikely(skb->sk))
393                 goto drop;
394
395         if (skb_warn_if_lro(skb))
396                 goto drop;
397
398         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
400                                  IPSTATS_MIB_INDISCARDS);
401                 goto drop;
402         }
403
404         skb_forward_csum(skb);
405
406         /*
407          *      We DO NOT make any processing on
408          *      RA packets, pushing them to user level AS IS
409          *      without ane WARRANTY that application will be able
410          *      to interpret them. The reason is that we
411          *      cannot make anything clever here.
412          *
413          *      We are not end-node, so that if packet contains
414          *      AH/ESP, we cannot make anything.
415          *      Defragmentation also would be mistake, RA packets
416          *      cannot be fragmented, because there is no warranty
417          *      that different fragments will go along one path. --ANK
418          */
419         if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
420                 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
421                         return 0;
422         }
423
424         /*
425          *      check and decrement ttl
426          */
427         if (hdr->hop_limit <= 1) {
428                 /* Force OUTPUT device used as source address */
429                 skb->dev = dst->dev;
430                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
431                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
432                                  IPSTATS_MIB_INHDRERRORS);
433
434                 kfree_skb(skb);
435                 return -ETIMEDOUT;
436         }
437
438         /* XXX: idev->cnf.proxy_ndp? */
439         if (net->ipv6.devconf_all->proxy_ndp &&
440             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
441                 int proxied = ip6_forward_proxy_check(skb);
442                 if (proxied > 0)
443                         return ip6_input(skb);
444                 else if (proxied < 0) {
445                         IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
446                                          IPSTATS_MIB_INDISCARDS);
447                         goto drop;
448                 }
449         }
450
451         if (!xfrm6_route_forward(skb)) {
452                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
453                                  IPSTATS_MIB_INDISCARDS);
454                 goto drop;
455         }
456         dst = skb_dst(skb);
457
458         /* IPv6 specs say nothing about it, but it is clear that we cannot
459            send redirects to source routed frames.
460            We don't send redirects to frames decapsulated from IPsec.
461          */
462         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
463                 struct in6_addr *target = NULL;
464                 struct inet_peer *peer;
465                 struct rt6_info *rt;
466
467                 /*
468                  *      incoming and outgoing devices are the same
469                  *      send a redirect.
470                  */
471
472                 rt = (struct rt6_info *) dst;
473                 if (rt->rt6i_flags & RTF_GATEWAY)
474                         target = &rt->rt6i_gateway;
475                 else
476                         target = &hdr->daddr;
477
478                 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
479
480                 /* Limit redirects both by destination (here)
481                    and by source (inside ndisc_send_redirect)
482                  */
483                 if (inet_peer_xrlim_allow(peer, 1*HZ))
484                         ndisc_send_redirect(skb, target);
485                 if (peer)
486                         inet_putpeer(peer);
487         } else {
488                 int addrtype = ipv6_addr_type(&hdr->saddr);
489
490                 /* This check is security critical. */
491                 if (addrtype == IPV6_ADDR_ANY ||
492                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493                         goto error;
494                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496                                     ICMPV6_NOT_NEIGHBOUR, 0);
497                         goto error;
498                 }
499         }
500
501         mtu = ip6_dst_mtu_forward(dst);
502         if (mtu < IPV6_MIN_MTU)
503                 mtu = IPV6_MIN_MTU;
504
505         if (ip6_pkt_too_big(skb, mtu)) {
506                 /* Again, force OUTPUT device used as source address */
507                 skb->dev = dst->dev;
508                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
510                                  IPSTATS_MIB_INTOOBIGERRORS);
511                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
512                                  IPSTATS_MIB_FRAGFAILS);
513                 kfree_skb(skb);
514                 return -EMSGSIZE;
515         }
516
517         if (skb_cow(skb, dst->dev->hard_header_len)) {
518                 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
519                                  IPSTATS_MIB_OUTDISCARDS);
520                 goto drop;
521         }
522
523         hdr = ipv6_hdr(skb);
524
525         /* Mangling hops number delayed to point after skb COW */
526
527         hdr->hop_limit--;
528
529         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
530         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
531         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
532                        net, NULL, skb, skb->dev, dst->dev,
533                        ip6_forward_finish);
534
535 error:
536         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
537 drop:
538         kfree_skb(skb);
539         return -EINVAL;
540 }
541
542 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 {
544         to->pkt_type = from->pkt_type;
545         to->priority = from->priority;
546         to->protocol = from->protocol;
547         skb_dst_drop(to);
548         skb_dst_set(to, dst_clone(skb_dst(from)));
549         to->dev = from->dev;
550         to->mark = from->mark;
551
552 #ifdef CONFIG_NET_SCHED
553         to->tc_index = from->tc_index;
554 #endif
555         nf_copy(to, from);
556         skb_copy_secmark(to, from);
557 }
558
559 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
560                  int (*output)(struct net *, struct sock *, struct sk_buff *))
561 {
562         struct sk_buff *frag;
563         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
564         struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
565                                 inet6_sk(skb->sk) : NULL;
566         struct ipv6hdr *tmp_hdr;
567         struct frag_hdr *fh;
568         unsigned int mtu, hlen, left, len;
569         int hroom, troom;
570         __be32 frag_id;
571         int ptr, offset = 0, err = 0;
572         u8 *prevhdr, nexthdr = 0;
573
574         hlen = ip6_find_1stfragopt(skb, &prevhdr);
575         nexthdr = *prevhdr;
576
577         mtu = ip6_skb_dst_mtu(skb);
578
579         /* We must not fragment if the socket is set to force MTU discovery
580          * or if the skb it not generated by a local socket.
581          */
582         if (unlikely(!skb->ignore_df && skb->len > mtu))
583                 goto fail_toobig;
584
585         if (IP6CB(skb)->frag_max_size) {
586                 if (IP6CB(skb)->frag_max_size > mtu)
587                         goto fail_toobig;
588
589                 /* don't send fragments larger than what we received */
590                 mtu = IP6CB(skb)->frag_max_size;
591                 if (mtu < IPV6_MIN_MTU)
592                         mtu = IPV6_MIN_MTU;
593         }
594
595         if (np && np->frag_size < mtu) {
596                 if (np->frag_size)
597                         mtu = np->frag_size;
598         }
599         mtu -= hlen + sizeof(struct frag_hdr);
600
601         frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
602                                     &ipv6_hdr(skb)->saddr);
603
604         hroom = LL_RESERVED_SPACE(rt->dst.dev);
605         if (skb_has_frag_list(skb)) {
606                 int first_len = skb_pagelen(skb);
607                 struct sk_buff *frag2;
608
609                 if (first_len - hlen > mtu ||
610                     ((first_len - hlen) & 7) ||
611                     skb_cloned(skb) ||
612                     skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
613                         goto slow_path;
614
615                 skb_walk_frags(skb, frag) {
616                         /* Correct geometry. */
617                         if (frag->len > mtu ||
618                             ((frag->len & 7) && frag->next) ||
619                             skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
620                                 goto slow_path_clean;
621
622                         /* Partially cloned skb? */
623                         if (skb_shared(frag))
624                                 goto slow_path_clean;
625
626                         BUG_ON(frag->sk);
627                         if (skb->sk) {
628                                 frag->sk = skb->sk;
629                                 frag->destructor = sock_wfree;
630                         }
631                         skb->truesize -= frag->truesize;
632                 }
633
634                 err = 0;
635                 offset = 0;
636                 /* BUILD HEADER */
637
638                 *prevhdr = NEXTHDR_FRAGMENT;
639                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
640                 if (!tmp_hdr) {
641                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
642                                       IPSTATS_MIB_FRAGFAILS);
643                         err = -ENOMEM;
644                         goto fail;
645                 }
646                 frag = skb_shinfo(skb)->frag_list;
647                 skb_frag_list_init(skb);
648
649                 __skb_pull(skb, hlen);
650                 fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
651                 __skb_push(skb, hlen);
652                 skb_reset_network_header(skb);
653                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
654
655                 fh->nexthdr = nexthdr;
656                 fh->reserved = 0;
657                 fh->frag_off = htons(IP6_MF);
658                 fh->identification = frag_id;
659
660                 first_len = skb_pagelen(skb);
661                 skb->data_len = first_len - skb_headlen(skb);
662                 skb->len = first_len;
663                 ipv6_hdr(skb)->payload_len = htons(first_len -
664                                                    sizeof(struct ipv6hdr));
665
666                 dst_hold(&rt->dst);
667
668                 for (;;) {
669                         /* Prepare header of the next frame,
670                          * before previous one went down. */
671                         if (frag) {
672                                 frag->ip_summed = CHECKSUM_NONE;
673                                 skb_reset_transport_header(frag);
674                                 fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
675                                 __skb_push(frag, hlen);
676                                 skb_reset_network_header(frag);
677                                 memcpy(skb_network_header(frag), tmp_hdr,
678                                        hlen);
679                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
680                                 fh->nexthdr = nexthdr;
681                                 fh->reserved = 0;
682                                 fh->frag_off = htons(offset);
683                                 if (frag->next)
684                                         fh->frag_off |= htons(IP6_MF);
685                                 fh->identification = frag_id;
686                                 ipv6_hdr(frag)->payload_len =
687                                                 htons(frag->len -
688                                                       sizeof(struct ipv6hdr));
689                                 ip6_copy_metadata(frag, skb);
690                         }
691
692                         err = output(net, sk, skb);
693                         if (!err)
694                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
695                                               IPSTATS_MIB_FRAGCREATES);
696
697                         if (err || !frag)
698                                 break;
699
700                         skb = frag;
701                         frag = skb->next;
702                         skb->next = NULL;
703                 }
704
705                 kfree(tmp_hdr);
706
707                 if (err == 0) {
708                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
709                                       IPSTATS_MIB_FRAGOKS);
710                         ip6_rt_put(rt);
711                         return 0;
712                 }
713
714                 kfree_skb_list(frag);
715
716                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
717                               IPSTATS_MIB_FRAGFAILS);
718                 ip6_rt_put(rt);
719                 return err;
720
721 slow_path_clean:
722                 skb_walk_frags(skb, frag2) {
723                         if (frag2 == frag)
724                                 break;
725                         frag2->sk = NULL;
726                         frag2->destructor = NULL;
727                         skb->truesize += frag2->truesize;
728                 }
729         }
730
731 slow_path:
732         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
733             skb_checksum_help(skb))
734                 goto fail;
735
736         left = skb->len - hlen;         /* Space per frame */
737         ptr = hlen;                     /* Where to start from */
738
739         /*
740          *      Fragment the datagram.
741          */
742
743         *prevhdr = NEXTHDR_FRAGMENT;
744         troom = rt->dst.dev->needed_tailroom;
745
746         /*
747          *      Keep copying data until we run out.
748          */
749         while (left > 0)        {
750                 len = left;
751                 /* IF: it doesn't fit, use 'mtu' - the data space left */
752                 if (len > mtu)
753                         len = mtu;
754                 /* IF: we are not sending up to and including the packet end
755                    then align the next start on an eight byte boundary */
756                 if (len < left) {
757                         len &= ~7;
758                 }
759
760                 /* Allocate buffer */
761                 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
762                                  hroom + troom, GFP_ATOMIC);
763                 if (!frag) {
764                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
765                                       IPSTATS_MIB_FRAGFAILS);
766                         err = -ENOMEM;
767                         goto fail;
768                 }
769
770                 /*
771                  *      Set up data on packet
772                  */
773
774                 ip6_copy_metadata(frag, skb);
775                 skb_reserve(frag, hroom);
776                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
777                 skb_reset_network_header(frag);
778                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
779                 frag->transport_header = (frag->network_header + hlen +
780                                           sizeof(struct frag_hdr));
781
782                 /*
783                  *      Charge the memory for the fragment to any owner
784                  *      it might possess
785                  */
786                 if (skb->sk)
787                         skb_set_owner_w(frag, skb->sk);
788
789                 /*
790                  *      Copy the packet header into the new buffer.
791                  */
792                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
793
794                 /*
795                  *      Build fragment header.
796                  */
797                 fh->nexthdr = nexthdr;
798                 fh->reserved = 0;
799                 fh->identification = frag_id;
800
801                 /*
802                  *      Copy a block of the IP datagram.
803                  */
804                 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
805                                      len));
806                 left -= len;
807
808                 fh->frag_off = htons(offset);
809                 if (left > 0)
810                         fh->frag_off |= htons(IP6_MF);
811                 ipv6_hdr(frag)->payload_len = htons(frag->len -
812                                                     sizeof(struct ipv6hdr));
813
814                 ptr += len;
815                 offset += len;
816
817                 /*
818                  *      Put this fragment into the sending queue.
819                  */
820                 err = output(net, sk, frag);
821                 if (err)
822                         goto fail;
823
824                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
825                               IPSTATS_MIB_FRAGCREATES);
826         }
827         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
828                       IPSTATS_MIB_FRAGOKS);
829         consume_skb(skb);
830         return err;
831
832 fail_toobig:
833         if (skb->sk && dst_allfrag(skb_dst(skb)))
834                 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
835
836         skb->dev = skb_dst(skb)->dev;
837         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
838         err = -EMSGSIZE;
839
840 fail:
841         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
842                       IPSTATS_MIB_FRAGFAILS);
843         kfree_skb(skb);
844         return err;
845 }
846
847 static inline int ip6_rt_check(const struct rt6key *rt_key,
848                                const struct in6_addr *fl_addr,
849                                const struct in6_addr *addr_cache)
850 {
851         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
852                 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
853 }
854
855 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
856                                           struct dst_entry *dst,
857                                           const struct flowi6 *fl6)
858 {
859         struct ipv6_pinfo *np = inet6_sk(sk);
860         struct rt6_info *rt;
861
862         if (!dst)
863                 goto out;
864
865         if (dst->ops->family != AF_INET6) {
866                 dst_release(dst);
867                 return NULL;
868         }
869
870         rt = (struct rt6_info *)dst;
871         /* Yes, checking route validity in not connected
872          * case is not very simple. Take into account,
873          * that we do not support routing by source, TOS,
874          * and MSG_DONTROUTE            --ANK (980726)
875          *
876          * 1. ip6_rt_check(): If route was host route,
877          *    check that cached destination is current.
878          *    If it is network route, we still may
879          *    check its validity using saved pointer
880          *    to the last used address: daddr_cache.
881          *    We do not want to save whole address now,
882          *    (because main consumer of this service
883          *    is tcp, which has not this problem),
884          *    so that the last trick works only on connected
885          *    sockets.
886          * 2. oif also should be the same.
887          */
888         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
889 #ifdef CONFIG_IPV6_SUBTREES
890             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
891 #endif
892            (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
893               (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
894                 dst_release(dst);
895                 dst = NULL;
896         }
897
898 out:
899         return dst;
900 }
901
902 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
903                                struct dst_entry **dst, struct flowi6 *fl6)
904 {
905 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
906         struct neighbour *n;
907         struct rt6_info *rt;
908 #endif
909         int err;
910
911         /* The correct way to handle this would be to do
912          * ip6_route_get_saddr, and then ip6_route_output; however,
913          * the route-specific preferred source forces the
914          * ip6_route_output call _before_ ip6_route_get_saddr.
915          *
916          * In source specific routing (no src=any default route),
917          * ip6_route_output will fail given src=any saddr, though, so
918          * that's why we try it again later.
919          */
920         if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
921                 struct rt6_info *rt;
922                 bool had_dst = *dst != NULL;
923
924                 if (!had_dst)
925                         *dst = ip6_route_output(net, sk, fl6);
926                 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
927                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
928                                           sk ? inet6_sk(sk)->srcprefs : 0,
929                                           &fl6->saddr);
930                 if (err)
931                         goto out_err_release;
932
933                 /* If we had an erroneous initial result, pretend it
934                  * never existed and let the SA-enabled version take
935                  * over.
936                  */
937                 if (!had_dst && (*dst)->error) {
938                         dst_release(*dst);
939                         *dst = NULL;
940                 }
941         }
942
943         if (!*dst)
944                 *dst = ip6_route_output(net, sk, fl6);
945
946         err = (*dst)->error;
947         if (err)
948                 goto out_err_release;
949
950 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
951         /*
952          * Here if the dst entry we've looked up
953          * has a neighbour entry that is in the INCOMPLETE
954          * state and the src address from the flow is
955          * marked as OPTIMISTIC, we release the found
956          * dst entry and replace it instead with the
957          * dst entry of the nexthop router
958          */
959         rt = (struct rt6_info *) *dst;
960         rcu_read_lock_bh();
961         n = __ipv6_neigh_lookup_noref(rt->dst.dev,
962                                       rt6_nexthop(rt, &fl6->daddr));
963         err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
964         rcu_read_unlock_bh();
965
966         if (err) {
967                 struct inet6_ifaddr *ifp;
968                 struct flowi6 fl_gw6;
969                 int redirect;
970
971                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
972                                       (*dst)->dev, 1);
973
974                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
975                 if (ifp)
976                         in6_ifa_put(ifp);
977
978                 if (redirect) {
979                         /*
980                          * We need to get the dst entry for the
981                          * default router instead
982                          */
983                         dst_release(*dst);
984                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
985                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
986                         *dst = ip6_route_output(net, sk, &fl_gw6);
987                         err = (*dst)->error;
988                         if (err)
989                                 goto out_err_release;
990                 }
991         }
992 #endif
993
994         return 0;
995
996 out_err_release:
997         if (err == -ENETUNREACH)
998                 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
999         dst_release(*dst);
1000         *dst = NULL;
1001         return err;
1002 }
1003
/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace the lookup is performed in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      Looks up a route for @fl6.  Whatever *@dst held on entry is
 *      ignored: it is reset to NULL before the lookup starts.
 *
 *      Returns zero on success, or the error code of the lookup on
 *      failure.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        int err;

        /* Always start from scratch; no cached dst is consulted. */
        *dst = NULL;
        err = ip6_dst_lookup_tail(net, sk, dst, fl6);

        return err;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1021
1022 /**
1023  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1024  *      @sk: socket which provides route info
1025  *      @fl6: flow to lookup
1026  *      @final_dst: final destination address for ipsec lookup
1027  *
1028  *      This function performs a route lookup on the given flow.
1029  *
1030  *      It returns a valid dst pointer on success, or a pointer encoded
1031  *      error code.
1032  */
1033 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1034                                       const struct in6_addr *final_dst)
1035 {
1036         struct dst_entry *dst = NULL;
1037         int err;
1038
1039         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1040         if (err)
1041                 return ERR_PTR(err);
1042         if (final_dst)
1043                 fl6->daddr = *final_dst;
1044         if (!fl6->flowi6_oif)
1045                 fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
1046
1047         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1048 }
1049 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1050
1051 /**
1052  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1053  *      @sk: socket which provides the dst cache and route info
1054  *      @fl6: flow to lookup
1055  *      @final_dst: final destination address for ipsec lookup
1056  *
1057  *      This function performs a route lookup on the given flow with the
1058  *      possibility of using the cached route in the socket if it is valid.
1059  *      It will take the socket dst lock when operating on the dst cache.
1060  *      As a result, this function can only be used in process context.
1061  *
1062  *      It returns a valid dst pointer on success, or a pointer encoded
1063  *      error code.
1064  */
1065 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1066                                          const struct in6_addr *final_dst)
1067 {
1068         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1069         int err;
1070
1071         dst = ip6_sk_dst_check(sk, dst, fl6);
1072
1073         err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1074         if (err)
1075                 return ERR_PTR(err);
1076         if (final_dst)
1077                 fl6->daddr = *final_dst;
1078
1079         return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1080 }
1081 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1082
1083 static inline int ip6_ufo_append_data(struct sock *sk,
1084                         struct sk_buff_head *queue,
1085                         int getfrag(void *from, char *to, int offset, int len,
1086                         int odd, struct sk_buff *skb),
1087                         void *from, int length, int hh_len, int fragheaderlen,
1088                         int transhdrlen, int mtu, unsigned int flags,
1089                         const struct flowi6 *fl6)
1090
1091 {
1092         struct sk_buff *skb;
1093         int err;
1094
1095         /* There is support for UDP large send offload by network
1096          * device, so create one single skb packet containing complete
1097          * udp datagram
1098          */
1099         skb = skb_peek_tail(queue);
1100         if (!skb) {
1101                 skb = sock_alloc_send_skb(sk,
1102                         hh_len + fragheaderlen + transhdrlen + 20,
1103                         (flags & MSG_DONTWAIT), &err);
1104                 if (!skb)
1105                         return err;
1106
1107                 /* reserve space for Hardware header */
1108                 skb_reserve(skb, hh_len);
1109
1110                 /* create space for UDP/IP header */
1111                 skb_put(skb, fragheaderlen + transhdrlen);
1112
1113                 /* initialize network header pointer */
1114                 skb_reset_network_header(skb);
1115
1116                 /* initialize protocol header pointer */
1117                 skb->transport_header = skb->network_header + fragheaderlen;
1118
1119                 skb->protocol = htons(ETH_P_IPV6);
1120                 skb->csum = 0;
1121
1122                 __skb_queue_tail(queue, skb);
1123         } else if (skb_is_gso(skb)) {
1124                 goto append;
1125         }
1126
1127         skb->ip_summed = CHECKSUM_PARTIAL;
1128         /* Specify the length of each IPv6 datagram fragment.
1129          * It has to be a multiple of 8.
1130          */
1131         skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1132                                      sizeof(struct frag_hdr)) & ~7;
1133         skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1134         skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1135                                                          &fl6->daddr,
1136                                                          &fl6->saddr);
1137
1138 append:
1139         return skb_append_datato_frags(sk, skb, getfrag, from,
1140                                        (length - transhdrlen));
1141 }
1142
1143 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1144                                                gfp_t gfp)
1145 {
1146         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1147 }
1148
1149 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1150                                                 gfp_t gfp)
1151 {
1152         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1153 }
1154
1155 static void ip6_append_data_mtu(unsigned int *mtu,
1156                                 int *maxfraglen,
1157                                 unsigned int fragheaderlen,
1158                                 struct sk_buff *skb,
1159                                 struct rt6_info *rt,
1160                                 unsigned int orig_mtu)
1161 {
1162         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1163                 if (!skb) {
1164                         /* first fragment, reserve header_len */
1165                         *mtu = orig_mtu - rt->dst.header_len;
1166
1167                 } else {
1168                         /*
1169                          * this fragment is not first, the headers
1170                          * space is regarded as data space.
1171                          */
1172                         *mtu = orig_mtu;
1173                 }
1174                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1175                               + fragheaderlen - sizeof(struct frag_hdr);
1176         }
1177 }
1178
1179 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1180                           struct inet6_cork *v6_cork,
1181                           int hlimit, int tclass, struct ipv6_txoptions *opt,
1182                           struct rt6_info *rt, struct flowi6 *fl6)
1183 {
1184         struct ipv6_pinfo *np = inet6_sk(sk);
1185         unsigned int mtu;
1186
1187         /*
1188          * setup for corking
1189          */
1190         if (opt) {
1191                 if (WARN_ON(v6_cork->opt))
1192                         return -EINVAL;
1193
1194                 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1195                 if (unlikely(!v6_cork->opt))
1196                         return -ENOBUFS;
1197
1198                 v6_cork->opt->tot_len = opt->tot_len;
1199                 v6_cork->opt->opt_flen = opt->opt_flen;
1200                 v6_cork->opt->opt_nflen = opt->opt_nflen;
1201
1202                 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1203                                                     sk->sk_allocation);
1204                 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1205                         return -ENOBUFS;
1206
1207                 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1208                                                     sk->sk_allocation);
1209                 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1210                         return -ENOBUFS;
1211
1212                 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1213                                                    sk->sk_allocation);
1214                 if (opt->hopopt && !v6_cork->opt->hopopt)
1215                         return -ENOBUFS;
1216
1217                 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1218                                                     sk->sk_allocation);
1219                 if (opt->srcrt && !v6_cork->opt->srcrt)
1220                         return -ENOBUFS;
1221
1222                 /* need source address above miyazawa*/
1223         }
1224         dst_hold(&rt->dst);
1225         cork->base.dst = &rt->dst;
1226         cork->fl.u.ip6 = *fl6;
1227         v6_cork->hop_limit = hlimit;
1228         v6_cork->tclass = tclass;
1229         if (rt->dst.flags & DST_XFRM_TUNNEL)
1230                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1231                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1232         else
1233                 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1234                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1235         if (np->frag_size < mtu) {
1236                 if (np->frag_size)
1237                         mtu = np->frag_size;
1238         }
1239         cork->base.fragsize = mtu;
1240         if (dst_allfrag(rt->dst.path))
1241                 cork->base.flags |= IPCORK_ALLFRAG;
1242         cork->base.length = 0;
1243
1244         return 0;
1245 }
1246
1247 static int __ip6_append_data(struct sock *sk,
1248                              struct flowi6 *fl6,
1249                              struct sk_buff_head *queue,
1250                              struct inet_cork *cork,
1251                              struct inet6_cork *v6_cork,
1252                              struct page_frag *pfrag,
1253                              int getfrag(void *from, char *to, int offset,
1254                                          int len, int odd, struct sk_buff *skb),
1255                              void *from, int length, int transhdrlen,
1256                              unsigned int flags, int dontfrag)
1257 {
1258         struct sk_buff *skb, *skb_prev = NULL;
1259         unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1260         int exthdrlen = 0;
1261         int dst_exthdrlen = 0;
1262         int hh_len;
1263         int copy;
1264         int err;
1265         int offset = 0;
1266         __u8 tx_flags = 0;
1267         u32 tskey = 0;
1268         struct rt6_info *rt = (struct rt6_info *)cork->dst;
1269         struct ipv6_txoptions *opt = v6_cork->opt;
1270         int csummode = CHECKSUM_NONE;
1271
1272         skb = skb_peek_tail(queue);
1273         if (!skb) {
1274                 exthdrlen = opt ? opt->opt_flen : 0;
1275                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1276         }
1277
1278         mtu = cork->fragsize;
1279         orig_mtu = mtu;
1280
1281         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1282
1283         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1284                         (opt ? opt->opt_nflen : 0);
1285         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1286                      sizeof(struct frag_hdr);
1287
1288         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1289                 unsigned int maxnonfragsize, headersize;
1290
1291                 headersize = sizeof(struct ipv6hdr) +
1292                              (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1293                              (dst_allfrag(&rt->dst) ?
1294                               sizeof(struct frag_hdr) : 0) +
1295                              rt->rt6i_nfheader_len;
1296
1297                 if (ip6_sk_ignore_df(sk))
1298                         maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1299                 else
1300                         maxnonfragsize = mtu;
1301
1302                 /* dontfrag active */
1303                 if ((cork->length + length > mtu - headersize) && dontfrag &&
1304                     (sk->sk_protocol == IPPROTO_UDP ||
1305                      sk->sk_protocol == IPPROTO_RAW)) {
1306                         ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1307                                                    sizeof(struct ipv6hdr));
1308                         goto emsgsize;
1309                 }
1310
1311                 if (cork->length + length > maxnonfragsize - headersize) {
1312 emsgsize:
1313                         ipv6_local_error(sk, EMSGSIZE, fl6,
1314                                          mtu - headersize +
1315                                          sizeof(struct ipv6hdr));
1316                         return -EMSGSIZE;
1317                 }
1318         }
1319
1320         if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1321                 sock_tx_timestamp(sk, &tx_flags);
1322                 if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1323                     sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1324                         tskey = sk->sk_tskey++;
1325         }
1326
1327         /* If this is the first and only packet and device
1328          * supports checksum offloading, let's use it.
1329          * Use transhdrlen, same as IPv4, because partial
1330          * sums only work when transhdrlen is set.
1331          */
1332         if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1333             length + fragheaderlen < mtu &&
1334             rt->dst.dev->features & NETIF_F_V6_CSUM &&
1335             !exthdrlen)
1336                 csummode = CHECKSUM_PARTIAL;
1337         /*
1338          * Let's try using as much space as possible.
1339          * Use MTU if total length of the message fits into the MTU.
1340          * Otherwise, we need to reserve fragment header and
1341          * fragment alignment (= 8-15 octects, in total).
1342          *
1343          * Note that we may need to "move" the data from the tail of
1344          * of the buffer to the new fragment when we split
1345          * the message.
1346          *
1347          * FIXME: It may be fragmented into multiple chunks
1348          *        at once if non-fragmentable extension headers
1349          *        are too large.
1350          * --yoshfuji
1351          */
1352
1353         cork->length += length;
1354         if (((length > mtu) ||
1355              (skb && skb_is_gso(skb))) &&
1356             (sk->sk_protocol == IPPROTO_UDP) &&
1357             (rt->dst.dev->features & NETIF_F_UFO) &&
1358             (sk->sk_type == SOCK_DGRAM)) {
1359                 err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1360                                           hh_len, fragheaderlen,
1361                                           transhdrlen, mtu, flags, fl6);
1362                 if (err)
1363                         goto error;
1364                 return 0;
1365         }
1366
1367         if (!skb)
1368                 goto alloc_new_skb;
1369
1370         while (length > 0) {
1371                 /* Check if the remaining data fits into current packet. */
1372                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1373                 if (copy < length)
1374                         copy = maxfraglen - skb->len;
1375
1376                 if (copy <= 0) {
1377                         char *data;
1378                         unsigned int datalen;
1379                         unsigned int fraglen;
1380                         unsigned int fraggap;
1381                         unsigned int alloclen;
1382 alloc_new_skb:
1383                         /* There's no room in the current skb */
1384                         if (skb)
1385                                 fraggap = skb->len - maxfraglen;
1386                         else
1387                                 fraggap = 0;
1388                         /* update mtu and maxfraglen if necessary */
1389                         if (!skb || !skb_prev)
1390                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1391                                                     fragheaderlen, skb, rt,
1392                                                     orig_mtu);
1393
1394                         skb_prev = skb;
1395
1396                         /*
1397                          * If remaining data exceeds the mtu,
1398                          * we know we need more fragment(s).
1399                          */
1400                         datalen = length + fraggap;
1401
1402                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1403                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1404                         if ((flags & MSG_MORE) &&
1405                             !(rt->dst.dev->features&NETIF_F_SG))
1406                                 alloclen = mtu;
1407                         else
1408                                 alloclen = datalen + fragheaderlen;
1409
1410                         alloclen += dst_exthdrlen;
1411
1412                         if (datalen != length + fraggap) {
1413                                 /*
1414                                  * this is not the last fragment, the trailer
1415                                  * space is regarded as data space.
1416                                  */
1417                                 datalen += rt->dst.trailer_len;
1418                         }
1419
1420                         alloclen += rt->dst.trailer_len;
1421                         fraglen = datalen + fragheaderlen;
1422
1423                         /*
1424                          * We just reserve space for fragment header.
1425                          * Note: this may be overallocation if the message
1426                          * (without MSG_MORE) fits into the MTU.
1427                          */
1428                         alloclen += sizeof(struct frag_hdr);
1429
1430                         if (transhdrlen) {
1431                                 skb = sock_alloc_send_skb(sk,
1432                                                 alloclen + hh_len,
1433                                                 (flags & MSG_DONTWAIT), &err);
1434                         } else {
1435                                 skb = NULL;
1436                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1437                                     2 * sk->sk_sndbuf)
1438                                         skb = sock_wmalloc(sk,
1439                                                            alloclen + hh_len, 1,
1440                                                            sk->sk_allocation);
1441                                 if (unlikely(!skb))
1442                                         err = -ENOBUFS;
1443                         }
1444                         if (!skb)
1445                                 goto error;
1446                         /*
1447                          *      Fill in the control structures
1448                          */
1449                         skb->protocol = htons(ETH_P_IPV6);
1450                         skb->ip_summed = csummode;
1451                         skb->csum = 0;
1452                         /* reserve for fragmentation and ipsec header */
1453                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1454                                     dst_exthdrlen);
1455
1456                         /* Only the initial fragment is time stamped */
1457                         skb_shinfo(skb)->tx_flags = tx_flags;
1458                         tx_flags = 0;
1459                         skb_shinfo(skb)->tskey = tskey;
1460                         tskey = 0;
1461
1462                         /*
1463                          *      Find where to start putting bytes
1464                          */
1465                         data = skb_put(skb, fraglen);
1466                         skb_set_network_header(skb, exthdrlen);
1467                         data += fragheaderlen;
1468                         skb->transport_header = (skb->network_header +
1469                                                  fragheaderlen);
1470                         if (fraggap) {
1471                                 skb->csum = skb_copy_and_csum_bits(
1472                                         skb_prev, maxfraglen,
1473                                         data + transhdrlen, fraggap, 0);
1474                                 skb_prev->csum = csum_sub(skb_prev->csum,
1475                                                           skb->csum);
1476                                 data += fraggap;
1477                                 pskb_trim_unique(skb_prev, maxfraglen);
1478                         }
1479                         copy = datalen - transhdrlen - fraggap;
1480
1481                         if (copy < 0) {
1482                                 err = -EINVAL;
1483                                 kfree_skb(skb);
1484                                 goto error;
1485                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1486                                 err = -EFAULT;
1487                                 kfree_skb(skb);
1488                                 goto error;
1489                         }
1490
1491                         offset += copy;
1492                         length -= datalen - fraggap;
1493                         transhdrlen = 0;
1494                         exthdrlen = 0;
1495                         dst_exthdrlen = 0;
1496
1497                         /*
1498                          * Put the packet on the pending queue
1499                          */
1500                         __skb_queue_tail(queue, skb);
1501                         continue;
1502                 }
1503
1504                 if (copy > length)
1505                         copy = length;
1506
1507                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1508                         unsigned int off;
1509
1510                         off = skb->len;
1511                         if (getfrag(from, skb_put(skb, copy),
1512                                                 offset, copy, off, skb) < 0) {
1513                                 __skb_trim(skb, off);
1514                                 err = -EFAULT;
1515                                 goto error;
1516                         }
1517                 } else {
1518                         int i = skb_shinfo(skb)->nr_frags;
1519
1520                         err = -ENOMEM;
1521                         if (!sk_page_frag_refill(sk, pfrag))
1522                                 goto error;
1523
1524                         if (!skb_can_coalesce(skb, i, pfrag->page,
1525                                               pfrag->offset)) {
1526                                 err = -EMSGSIZE;
1527                                 if (i == MAX_SKB_FRAGS)
1528                                         goto error;
1529
1530                                 __skb_fill_page_desc(skb, i, pfrag->page,
1531                                                      pfrag->offset, 0);
1532                                 skb_shinfo(skb)->nr_frags = ++i;
1533                                 get_page(pfrag->page);
1534                         }
1535                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1536                         if (getfrag(from,
1537                                     page_address(pfrag->page) + pfrag->offset,
1538                                     offset, copy, skb->len, skb) < 0)
1539                                 goto error_efault;
1540
1541                         pfrag->offset += copy;
1542                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1543                         skb->len += copy;
1544                         skb->data_len += copy;
1545                         skb->truesize += copy;
1546                         atomic_add(copy, &sk->sk_wmem_alloc);
1547                 }
1548                 offset += copy;
1549                 length -= copy;
1550         }
1551
1552         return 0;
1553
1554 error_efault:
1555         err = -EFAULT;
1556 error:
1557         cork->length -= length;
1558         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1559         return err;
1560 }
1561
1562 int ip6_append_data(struct sock *sk,
1563                     int getfrag(void *from, char *to, int offset, int len,
1564                                 int odd, struct sk_buff *skb),
1565                     void *from, int length, int transhdrlen, int hlimit,
1566                     int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1567                     struct rt6_info *rt, unsigned int flags, int dontfrag)
1568 {
1569         struct inet_sock *inet = inet_sk(sk);
1570         struct ipv6_pinfo *np = inet6_sk(sk);
1571         int exthdrlen;
1572         int err;
1573
1574         if (flags&MSG_PROBE)
1575                 return 0;
1576         if (skb_queue_empty(&sk->sk_write_queue)) {
1577                 /*
1578                  * setup for corking
1579                  */
1580                 err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1581                                      tclass, opt, rt, fl6);
1582                 if (err)
1583                         return err;
1584
1585                 exthdrlen = (opt ? opt->opt_flen : 0);
1586                 length += exthdrlen;
1587                 transhdrlen += exthdrlen;
1588         } else {
1589                 fl6 = &inet->cork.fl.u.ip6;
1590                 transhdrlen = 0;
1591         }
1592
1593         return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1594                                  &np->cork, sk_page_frag(sk), getfrag,
1595                                  from, length, transhdrlen, flags, dontfrag);
1596 }
1597 EXPORT_SYMBOL_GPL(ip6_append_data);
1598
1599 static void ip6_cork_release(struct inet_cork_full *cork,
1600                              struct inet6_cork *v6_cork)
1601 {
1602         if (v6_cork->opt) {
1603                 kfree(v6_cork->opt->dst0opt);
1604                 kfree(v6_cork->opt->dst1opt);
1605                 kfree(v6_cork->opt->hopopt);
1606                 kfree(v6_cork->opt->srcrt);
1607                 kfree(v6_cork->opt);
1608                 v6_cork->opt = NULL;
1609         }
1610
1611         if (cork->base.dst) {
1612                 dst_release(cork->base.dst);
1613                 cork->base.dst = NULL;
1614                 cork->base.flags &= ~IPCORK_ALLFRAG;
1615         }
1616         memset(&cork->fl, 0, sizeof(cork->fl));
1617 }
1618
1619 struct sk_buff *__ip6_make_skb(struct sock *sk,
1620                                struct sk_buff_head *queue,
1621                                struct inet_cork_full *cork,
1622                                struct inet6_cork *v6_cork)
1623 {
1624         struct sk_buff *skb, *tmp_skb;
1625         struct sk_buff **tail_skb;
1626         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1627         struct ipv6_pinfo *np = inet6_sk(sk);
1628         struct net *net = sock_net(sk);
1629         struct ipv6hdr *hdr;
1630         struct ipv6_txoptions *opt = v6_cork->opt;
1631         struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1632         struct flowi6 *fl6 = &cork->fl.u.ip6;
1633         unsigned char proto = fl6->flowi6_proto;
1634
1635         skb = __skb_dequeue(queue);
1636         if (!skb)
1637                 goto out;
1638         tail_skb = &(skb_shinfo(skb)->frag_list);
1639
1640         /* move skb->data to ip header from ext header */
1641         if (skb->data < skb_network_header(skb))
1642                 __skb_pull(skb, skb_network_offset(skb));
1643         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1644                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1645                 *tail_skb = tmp_skb;
1646                 tail_skb = &(tmp_skb->next);
1647                 skb->len += tmp_skb->len;
1648                 skb->data_len += tmp_skb->len;
1649                 skb->truesize += tmp_skb->truesize;
1650                 tmp_skb->destructor = NULL;
1651                 tmp_skb->sk = NULL;
1652         }
1653
1654         /* Allow local fragmentation. */
1655         skb->ignore_df = ip6_sk_ignore_df(sk);
1656
1657         *final_dst = fl6->daddr;
1658         __skb_pull(skb, skb_network_header_len(skb));
1659         if (opt && opt->opt_flen)
1660                 ipv6_push_frag_opts(skb, opt, &proto);
1661         if (opt && opt->opt_nflen)
1662                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1663
1664         skb_push(skb, sizeof(struct ipv6hdr));
1665         skb_reset_network_header(skb);
1666         hdr = ipv6_hdr(skb);
1667
1668         ip6_flow_hdr(hdr, v6_cork->tclass,
1669                      ip6_make_flowlabel(net, skb, fl6->flowlabel,
1670                                         np->autoflowlabel, fl6));
1671         hdr->hop_limit = v6_cork->hop_limit;
1672         hdr->nexthdr = proto;
1673         hdr->saddr = fl6->saddr;
1674         hdr->daddr = *final_dst;
1675
1676         skb->priority = sk->sk_priority;
1677         skb->mark = sk->sk_mark;
1678
1679         skb_dst_set(skb, dst_clone(&rt->dst));
1680         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1681         if (proto == IPPROTO_ICMPV6) {
1682                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1683
1684                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1685                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1686         }
1687
1688         ip6_cork_release(cork, v6_cork);
1689 out:
1690         return skb;
1691 }
1692
1693 int ip6_send_skb(struct sk_buff *skb)
1694 {
1695         struct net *net = sock_net(skb->sk);
1696         struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1697         int err;
1698
1699         err = ip6_local_out(net, skb->sk, skb);
1700         if (err) {
1701                 if (err > 0)
1702                         err = net_xmit_errno(err);
1703                 if (err)
1704                         IP6_INC_STATS(net, rt->rt6i_idev,
1705                                       IPSTATS_MIB_OUTDISCARDS);
1706         }
1707
1708         return err;
1709 }
1710
/*
 * ip6_push_pending_frames - build and transmit the socket's pending data
 *
 * Returns 0 when ip6_finish_skb() yields no packet, otherwise the result
 * of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1722
1723 static void __ip6_flush_pending_frames(struct sock *sk,
1724                                        struct sk_buff_head *queue,
1725                                        struct inet_cork_full *cork,
1726                                        struct inet6_cork *v6_cork)
1727 {
1728         struct sk_buff *skb;
1729
1730         while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1731                 if (skb_dst(skb))
1732                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1733                                       IPSTATS_MIB_OUTDISCARDS);
1734                 kfree_skb(skb);
1735         }
1736
1737         ip6_cork_release(cork, v6_cork);
1738 }
1739
1740 void ip6_flush_pending_frames(struct sock *sk)
1741 {
1742         __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1743                                    &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1744 }
1745 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1746
1747 struct sk_buff *ip6_make_skb(struct sock *sk,
1748                              int getfrag(void *from, char *to, int offset,
1749                                          int len, int odd, struct sk_buff *skb),
1750                              void *from, int length, int transhdrlen,
1751                              int hlimit, int tclass,
1752                              struct ipv6_txoptions *opt, struct flowi6 *fl6,
1753                              struct rt6_info *rt, unsigned int flags,
1754                              int dontfrag)
1755 {
1756         struct inet_cork_full cork;
1757         struct inet6_cork v6_cork;
1758         struct sk_buff_head queue;
1759         int exthdrlen = (opt ? opt->opt_flen : 0);
1760         int err;
1761
1762         if (flags & MSG_PROBE)
1763                 return NULL;
1764
1765         __skb_queue_head_init(&queue);
1766
1767         cork.base.flags = 0;
1768         cork.base.addr = 0;
1769         cork.base.opt = NULL;
1770         v6_cork.opt = NULL;
1771         err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1772         if (err)
1773                 return ERR_PTR(err);
1774
1775         if (dontfrag < 0)
1776                 dontfrag = inet6_sk(sk)->dontfrag;
1777
1778         err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1779                                 &current->task_frag, getfrag, from,
1780                                 length + exthdrlen, transhdrlen + exthdrlen,
1781                                 flags, dontfrag);
1782         if (err) {
1783                 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1784                 return ERR_PTR(err);
1785         }
1786
1787         return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1788 }