1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87 /* Check TCP sequence numbers in ICMP packets. */
88 #define ICMP_MIN_LENGTH 8
89
90 /* Socket used for sending RSTs */
91 static struct socket *tcp_socket __read_mostly;
92
93 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
97                                                    __be32 addr);
98 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
99                                    __be32 saddr, __be32 daddr,
100                                    struct tcphdr *th, int protocol,
101                                    int tcplen);
102 #endif
103
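/* Global TCP hash tables (established, bind and listening chains). Only the
 * listening-hash members need static initialization here; the remaining
 * tables are allocated later at boot (in tcp_init()).
 */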
104 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
105         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
106         .lhash_users = ATOMIC_INIT(0),
107         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
108 };
109
110 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
111 {
112         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
113                                  inet_csk_bind_conflict);
114 }
115
116 static void tcp_v4_hash(struct sock *sk)
117 {
118         inet_hash(&tcp_hashinfo, sk);
119 }
120
121 void tcp_unhash(struct sock *sk)
122 {
123         inet_unhash(&tcp_hashinfo, sk);
124 }
125
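/* Pick an initial sequence number for the connection described by the
 * incoming skb's 4-tuple (note the reply direction: local address and
 * port come first).
 */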
126 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
127 {
128         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
129                                           ip_hdr(skb)->saddr,
130                                           tcp_hdr(skb)->dest,
131                                           tcp_hdr(skb)->source);
132 }
133
134 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
135 {
136         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
137         struct tcp_sock *tp = tcp_sk(sk);
138
139         /* With PAWS, it is safe from the viewpoint
140            of data integrity. Even without PAWS it is safe provided sequence
141            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
142
143            Actually, the idea is close to VJ's, only the timestamp cache is
144            held not per host but per port pair, and the TW bucket is used as
145            the state holder.
146
147            If the TW bucket has already been destroyed we fall back to VJ's
148            scheme and use the initial timestamp retrieved from the peer table.
149          */
150         if (tcptw->tw_ts_recent_stamp &&
151             (twp == NULL || (sysctl_tcp_tw_reuse &&
152                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
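                /* Start the new connection's sequence space well past the old
                 * TIME-WAIT socket's snd_nxt, so stray segments from the
                 * previous incarnation cannot be mistaken for new data.
                 */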
153                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
154                 if (tp->write_seq == 0)
155                         tp->write_seq = 1;
156                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
157                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
158                 sock_hold(sktw);
159                 return 1;
160         }
161
162         return 0;
163 }
164
165 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
166
167 /* This will initiate an outgoing connection. */
168 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
169 {
170         struct inet_sock *inet = inet_sk(sk);
171         struct tcp_sock *tp = tcp_sk(sk);
172         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
173         struct rtable *rt;
174         __be32 daddr, nexthop;
175         int tmp;
176         int err;
177
178         if (addr_len < sizeof(struct sockaddr_in))
179                 return -EINVAL;
180
181         if (usin->sin_family != AF_INET)
182                 return -EAFNOSUPPORT;
183
184         nexthop = daddr = usin->sin_addr.s_addr;
185         if (inet->opt && inet->opt->srr) {
186                 if (!daddr)
187                         return -EINVAL;
188                 nexthop = inet->opt->faddr;
189         }
190
191         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
192                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
193                                IPPROTO_TCP,
194                                inet->sport, usin->sin_port, sk, 1);
195         if (tmp < 0) {
196                 if (tmp == -ENETUNREACH)
197                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
198                 return tmp;
199         }
200
201         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
202                 ip_rt_put(rt);
203                 return -ENETUNREACH;
204         }
205
206         if (!inet->opt || !inet->opt->srr)
207                 daddr = rt->rt_dst;
208
209         if (!inet->saddr)
210                 inet->saddr = rt->rt_src;
211         inet->rcv_saddr = inet->saddr;
212
213         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
214                 /* Reset inherited state */
215                 tp->rx_opt.ts_recent       = 0;
216                 tp->rx_opt.ts_recent_stamp = 0;
217                 tp->write_seq              = 0;
218         }
219
220         if (tcp_death_row.sysctl_tw_recycle &&
221             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
222                 struct inet_peer *peer = rt_get_peer(rt);
223                 /*
224                  * VJ's idea. We save the last timestamp seen from
225                  * the destination in the peer table when entering
226                  * TIME-WAIT state, and initialize rx_opt.ts_recent from
227                  * it when trying a new connection.
228                  */
229                 if (peer != NULL &&
230                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
231                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
232                         tp->rx_opt.ts_recent = peer->tcp_ts;
233                 }
234         }
235
236         inet->dport = usin->sin_port;
237         inet->daddr = daddr;
238
239         inet_csk(sk)->icsk_ext_hdr_len = 0;
240         if (inet->opt)
241                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
242
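        /* Default MSS clamp per RFC 1122 (576-byte datagram minus 40 bytes of
         * IP+TCP headers), used until the route and the peer's SYN options
         * tell us better.
         */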
243         tp->rx_opt.mss_clamp = 536;
244
245         /* Socket identity is still unknown (sport may be zero).
246          * However we set the state to SYN-SENT and, without releasing the
247          * socket lock, select a source port, enter ourselves into the hash
248          * tables and complete initialization after this.
249          */
250         tcp_set_state(sk, TCP_SYN_SENT);
251         err = inet_hash_connect(&tcp_death_row, sk);
252         if (err)
253                 goto failure;
254
255         err = ip_route_newports(&rt, IPPROTO_TCP,
256                                 inet->sport, inet->dport, sk);
257         if (err)
258                 goto failure;
259
260         /* OK, now commit destination to socket.  */
261         sk->sk_gso_type = SKB_GSO_TCPV4;
262         sk_setup_caps(sk, &rt->u.dst);
263
264         if (!tp->write_seq)
265                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
266                                                            inet->daddr,
267                                                            inet->sport,
268                                                            usin->sin_port);
269
270         inet->id = tp->write_seq ^ jiffies;
271
272         err = tcp_connect(sk);
273         rt = NULL;
274         if (err)
275                 goto failure;
276
277         return 0;
278
279 failure:
280         /*
281          * This unhashes the socket and releases the local port,
282          * if necessary.
283          */
284         tcp_set_state(sk, TCP_CLOSE);
285         ip_rt_put(rt);
286         sk->sk_route_caps = 0;
287         inet->dport = 0;
288         return err;
289 }
290
291 /*
292  * This routine does path mtu discovery as defined in RFC1191.
293  */
294 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
295 {
296         struct dst_entry *dst;
297         struct inet_sock *inet = inet_sk(sk);
298
299         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
300          * sent out by Linux are always < 576 bytes, so they should go
301          * through unfragmented).
302          */
303         if (sk->sk_state == TCP_LISTEN)
304                 return;
305
306         /* We don't check in the dst entry whether PMTU discovery is
307          * forbidden on this route. We just assume that no packet-too-big
308          * packets are sent back when PMTU discovery is not active.
309          * There is a small race when the user changes this flag in the
310          * route, but I think that's acceptable.
311          */
312         if ((dst = __sk_dst_check(sk, 0)) == NULL)
313                 return;
314
315         dst->ops->update_pmtu(dst, mtu);
316
317         /* Something is about to go wrong... Remember the soft error
318          * in case this connection is not able to recover.
319          */
320         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
321                 sk->sk_err_soft = EMSGSIZE;
322
323         mtu = dst_mtu(dst);
324
325         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
326             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
327                 tcp_sync_mss(sk, mtu);
328
329                 /* Resend the TCP packet because it's
330                  * clear that the old packet has been
331                  * dropped. This is the new "fast" path mtu
332                  * discovery.
333                  */
334                 tcp_simple_retransmit(sk);
335         } /* else let the usual retransmit timer handle it */
336 }
337
338 /*
339  * This routine is called by the ICMP module when it gets some
340  * sort of error condition.  If err < 0 then the socket should
341  * be closed and the error returned to the user.  If err > 0
342  * it's just the icmp type << 8 | icmp code.  After adjustment
343  * header points to the first 8 bytes of the tcp header.  We need
344  * to find the appropriate port.
345  *
346  * The locking strategy used here is very "optimistic". When
347  * someone else accesses the socket the ICMP is just dropped
348  * and for some paths there is no check at all.
349  * A more general error queue to queue errors for later handling
350  * is probably better.
351  *
352  */
353
354 void tcp_v4_err(struct sk_buff *skb, u32 info)
355 {
356         struct iphdr *iph = (struct iphdr *)skb->data;
357         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
358         struct tcp_sock *tp;
359         struct inet_sock *inet;
360         const int type = icmp_hdr(skb)->type;
361         const int code = icmp_hdr(skb)->code;
362         struct sock *sk;
363         __u32 seq;
364         int err;
365
366         if (skb->len < (iph->ihl << 2) + 8) {
367                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
368                 return;
369         }
370
371         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
372                          th->source, inet_iif(skb));
373         if (!sk) {
374                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
375                 return;
376         }
377         if (sk->sk_state == TCP_TIME_WAIT) {
378                 inet_twsk_put(inet_twsk(sk));
379                 return;
380         }
381
382         bh_lock_sock(sk);
383         /* If too many ICMPs get dropped on busy
384          * servers this needs to be solved differently.
385          */
386         if (sock_owned_by_user(sk))
387                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
388
389         if (sk->sk_state == TCP_CLOSE)
390                 goto out;
391
392         tp = tcp_sk(sk);
393         seq = ntohl(th->seq);
394         if (sk->sk_state != TCP_LISTEN &&
395             !between(seq, tp->snd_una, tp->snd_nxt)) {
396                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
397                 goto out;
398         }
399
400         switch (type) {
401         case ICMP_SOURCE_QUENCH:
402                 /* Just silently ignore these. */
403                 goto out;
404         case ICMP_PARAMETERPROB:
405                 err = EPROTO;
406                 break;
407         case ICMP_DEST_UNREACH:
408                 if (code > NR_ICMP_UNREACH)
409                         goto out;
410
411                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
412                         if (!sock_owned_by_user(sk))
413                                 do_pmtu_discovery(sk, iph, info);
414                         goto out;
415                 }
416
417                 err = icmp_err_convert[code].errno;
418                 break;
419         case ICMP_TIME_EXCEEDED:
420                 err = EHOSTUNREACH;
421                 break;
422         default:
423                 goto out;
424         }
425
426         switch (sk->sk_state) {
427                 struct request_sock *req, **prev;
428         case TCP_LISTEN:
429                 if (sock_owned_by_user(sk))
430                         goto out;
431
432                 req = inet_csk_search_req(sk, &prev, th->dest,
433                                           iph->daddr, iph->saddr);
434                 if (!req)
435                         goto out;
436
437                 /* ICMPs are not backlogged, hence we cannot get
438                    an established socket here.
439                  */
440                 BUG_TRAP(!req->sk);
441
442                 if (seq != tcp_rsk(req)->snt_isn) {
443                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
444                         goto out;
445                 }
446
447                 /*
448                  * Still in SYN_RECV, just remove it silently.
449                  * There is no good way to pass the error to the newly
450                  * created socket, and POSIX does not want network
451                  * errors returned from accept().
452                  */
453                 inet_csk_reqsk_queue_drop(sk, req, prev);
454                 goto out;
455
456         case TCP_SYN_SENT:
457         case TCP_SYN_RECV:  /* Cannot happen normally.
458                                It can, e.g., if SYNs crossed.
459                              */
460                 if (!sock_owned_by_user(sk)) {
461                         sk->sk_err = err;
462
463                         sk->sk_error_report(sk);
464
465                         tcp_done(sk);
466                 } else {
467                         sk->sk_err_soft = err;
468                 }
469                 goto out;
470         }
471
472         /* If we've already connected we will keep trying
473          * until we time out, or the user gives up.
474          *
475          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
476          * to be considered hard errors (well, FRAG_FAILED too,
477          * but it is obsoleted by pmtu discovery).
478          *
479          * Note that in the modern internet, where routing is unreliable
480          * and broken firewalls sit in every dark corner sending random
481          * errors ordered by their masters, even these two messages finally
482          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
483          *
484          * Now we are in compliance with RFCs.
485          *                                                      --ANK (980905)
486          */
487
488         inet = inet_sk(sk);
489         if (!sock_owned_by_user(sk) && inet->recverr) {
490                 sk->sk_err = err;
491                 sk->sk_error_report(sk);
492         } else  { /* Only an error on timeout */
493                 sk->sk_err_soft = err;
494         }
495
496 out:
497         bh_unlock_sock(sk);
498         sock_put(sk);
499 }
500
501 /* This routine computes an IPv4 TCP checksum. */
502 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
503 {
504         struct inet_sock *inet = inet_sk(sk);
505         struct tcphdr *th = tcp_hdr(skb);
506
507         if (skb->ip_summed == CHECKSUM_PARTIAL) {
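                /* Hardware (or GSO) will finish the checksum: seed it with the
                 * pseudo-header only and record where the check field lives.
                 */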
508                 th->check = ~tcp_v4_check(len, inet->saddr,
509                                           inet->daddr, 0);
510                 skb->csum_start = skb_transport_header(skb) - skb->head;
511                 skb->csum_offset = offsetof(struct tcphdr, check);
512         } else {
513                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
514                                          csum_partial((char *)th,
515                                                       th->doff << 2,
516                                                       skb->csum));
517         }
518 }
519
520 int tcp_v4_gso_send_check(struct sk_buff *skb)
521 {
522         const struct iphdr *iph;
523         struct tcphdr *th;
524
525         if (!pskb_may_pull(skb, sizeof(*th)))
526                 return -EINVAL;
527
528         iph = ip_hdr(skb);
529         th = tcp_hdr(skb);
530
531         th->check = 0;
532         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
533         skb->csum_start = skb_transport_header(skb) - skb->head;
534         skb->csum_offset = offsetof(struct tcphdr, check);
535         skb->ip_summed = CHECKSUM_PARTIAL;
536         return 0;
537 }
538
539 /*
540  *      This routine will send an RST to the other tcp.
541  *
542  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
543  *                    for the reset?
544  *      Answer: if a packet caused the RST, it is not for a socket
545  *              existing in our system; if it is matched to a socket,
546  *              it is just a duplicate segment or a bug in the other
547  *              side's TCP. So we build the reply based only on the
548  *              parameters that arrived with the segment.
549  *      Exception: precedence violation. We do not implement it in any case.
550  */
551
552 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
553 {
554         struct tcphdr *th = tcp_hdr(skb);
555         struct {
556                 struct tcphdr th;
557 #ifdef CONFIG_TCP_MD5SIG
558                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
559 #endif
560         } rep;
561         struct ip_reply_arg arg;
562 #ifdef CONFIG_TCP_MD5SIG
563         struct tcp_md5sig_key *key;
564 #endif
565
566         /* Never send a reset in response to a reset. */
567         if (th->rst)
568                 return;
569
570         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
571                 return;
572
573         /* Swap the send and the receive. */
574         memset(&rep, 0, sizeof(rep));
575         rep.th.dest   = th->source;
576         rep.th.source = th->dest;
577         rep.th.doff   = sizeof(struct tcphdr) / 4;
578         rep.th.rst    = 1;
579
580         if (th->ack) {
581                 rep.th.seq = th->ack_seq;
582         } else {
583                 rep.th.ack = 1;
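                /* ACK everything the offending segment occupied in sequence
                 * space: its payload plus one each for the SYN and FIN flags.
                 */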
584                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
585                                        skb->len - (th->doff << 2));
586         }
587
588         memset(&arg, 0, sizeof(arg));
589         arg.iov[0].iov_base = (unsigned char *)&rep;
590         arg.iov[0].iov_len  = sizeof(rep.th);
591
592 #ifdef CONFIG_TCP_MD5SIG
593         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
594         if (key) {
595                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
596                                    (TCPOPT_NOP << 16) |
597                                    (TCPOPT_MD5SIG << 8) |
598                                    TCPOLEN_MD5SIG);
599                 /* Update length and the length the header thinks exists */
600                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
601                 rep.th.doff = arg.iov[0].iov_len / 4;
602
603                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
604                                         key,
605                                         ip_hdr(skb)->daddr,
606                                         ip_hdr(skb)->saddr,
607                                         &rep.th, IPPROTO_TCP,
608                                         arg.iov[0].iov_len);
609         }
610 #endif
611         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
612                                       ip_hdr(skb)->saddr, /* XXX */
613                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
614         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
615
616         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
617
618         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
619         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
620 }
621
622 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
623    outside socket context, is certainly ugly. What can I do?
624  */
625
626 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
627                             struct sk_buff *skb, u32 seq, u32 ack,
628                             u32 win, u32 ts)
629 {
630         struct tcphdr *th = tcp_hdr(skb);
631         struct {
632                 struct tcphdr th;
633                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
634 #ifdef CONFIG_TCP_MD5SIG
635                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
636 #endif
637                         ];
638         } rep;
639         struct ip_reply_arg arg;
640 #ifdef CONFIG_TCP_MD5SIG
641         struct tcp_md5sig_key *key;
642         struct tcp_md5sig_key tw_key;
643 #endif
644
645         memset(&rep.th, 0, sizeof(struct tcphdr));
646         memset(&arg, 0, sizeof(arg));
647
648         arg.iov[0].iov_base = (unsigned char *)&rep;
649         arg.iov[0].iov_len  = sizeof(rep.th);
650         if (ts) {
651                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
652                                    (TCPOPT_TIMESTAMP << 8) |
653                                    TCPOLEN_TIMESTAMP);
654                 rep.opt[1] = htonl(tcp_time_stamp);
655                 rep.opt[2] = htonl(ts);
656                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
657         }
658
659         /* Swap the send and the receive. */
660         rep.th.dest    = th->source;
661         rep.th.source  = th->dest;
662         rep.th.doff    = arg.iov[0].iov_len / 4;
663         rep.th.seq     = htonl(seq);
664         rep.th.ack_seq = htonl(ack);
665         rep.th.ack     = 1;
666         rep.th.window  = htons(win);
667
668 #ifdef CONFIG_TCP_MD5SIG
669         /*
670          * The SKB holds an incoming packet, but may not have a valid ->sk
671          * pointer. This is especially the case when we're dealing with a
672          * TIME_WAIT ack, because the sk structure is long gone, and only
673          * the tcp_timewait_sock remains. So the md5 key is stashed in that
674          * structure, and we use it in preference.  I believe that (twsk ||
675          * skb->sk) holds true, but we program defensively.
676          */
677         if (!twsk && skb->sk) {
678                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
679         } else if (twsk && twsk->tw_md5_keylen) {
680                 tw_key.key = twsk->tw_md5_key;
681                 tw_key.keylen = twsk->tw_md5_keylen;
682                 key = &tw_key;
683         } else
684                 key = NULL;
685
686         if (key) {
687                 int offset = (ts) ? 3 : 0;
688
689                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
690                                           (TCPOPT_NOP << 16) |
691                                           (TCPOPT_MD5SIG << 8) |
692                                           TCPOLEN_MD5SIG);
693                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
694                 rep.th.doff = arg.iov[0].iov_len/4;
695
696                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
697                                         key,
698                                         ip_hdr(skb)->daddr,
699                                         ip_hdr(skb)->saddr,
700                                         &rep.th, IPPROTO_TCP,
701                                         arg.iov[0].iov_len);
702         }
703 #endif
704         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
705                                       ip_hdr(skb)->saddr, /* XXX */
706                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
707         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
708
709         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
710
711         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
712 }
713
714 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
715 {
716         struct inet_timewait_sock *tw = inet_twsk(sk);
717         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
718
719         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
720                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
721                         tcptw->tw_ts_recent);
722
723         inet_twsk_put(tw);
724 }
725
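/* Send an ACK on behalf of a pending (SYN_RECV) request sock, e.g. for a
 * retransmitted SYN; both ISNs are advanced by one because each SYN itself
 * consumes a sequence number.
 */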
726 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
727                                   struct request_sock *req)
728 {
729         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
730                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
731                         req->ts_recent);
732 }
733
734 /*
735  *      Send a SYN-ACK after having received an ACK.
736  *      This still operates on a request_sock only, not on a big
737  *      socket.
738  */
739 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
740                               struct dst_entry *dst)
741 {
742         const struct inet_request_sock *ireq = inet_rsk(req);
743         int err = -1;
744         struct sk_buff * skb;
745
746         /* First, grab a route. */
747         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
748                 goto out;
749
750         skb = tcp_make_synack(sk, dst, req);
751
752         if (skb) {
753                 struct tcphdr *th = tcp_hdr(skb);
754
755                 th->check = tcp_v4_check(skb->len,
756                                          ireq->loc_addr,
757                                          ireq->rmt_addr,
758                                          csum_partial((char *)th, skb->len,
759                                                       skb->csum));
760
761                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
762                                             ireq->rmt_addr,
763                                             ireq->opt);
764                 err = net_xmit_eval(err);
765         }
766
767 out:
768         dst_release(dst);
769         return err;
770 }
771
772 /*
773  *      IPv4 request_sock destructor.
774  */
775 static void tcp_v4_reqsk_destructor(struct request_sock *req)
776 {
777         kfree(inet_rsk(req)->opt);
778 }
779
780 #ifdef CONFIG_SYN_COOKIES
781 static void syn_flood_warning(struct sk_buff *skb)
782 {
783         static unsigned long warntime;
784
785         if (time_after(jiffies, (warntime + HZ * 60))) {
786                 warntime = jiffies;
787                 printk(KERN_INFO
788                        "possible SYN flooding on port %d. Sending cookies.\n",
789                        ntohs(tcp_hdr(skb)->dest));
790         }
791 }
792 #endif
793
794 /*
795  * Save and compile IPv4 options into the request_sock if needed.
796  */
797 static struct ip_options *tcp_v4_save_options(struct sock *sk,
798                                               struct sk_buff *skb)
799 {
800         struct ip_options *opt = &(IPCB(skb)->opt);
801         struct ip_options *dopt = NULL;
802
803         if (opt && opt->optlen) {
804                 int opt_size = optlength(opt);
805                 dopt = kmalloc(opt_size, GFP_ATOMIC);
806                 if (dopt) {
807                         if (ip_options_echo(dopt, skb)) {
808                                 kfree(dopt);
809                                 dopt = NULL;
810                         }
811                 }
812         }
813         return dopt;
814 }
815
816 #ifdef CONFIG_TCP_MD5SIG
817 /*
818  * RFC2385 MD5 checksumming requires a mapping of
819  * IP address->MD5 Key.
820  * We need to maintain these in the sk structure.
821  */
822
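/*
 * For reference, a rough sketch of how userspace installs such a key via the
 * TCP_MD5SIG socket option (handled by tcp_v4_parse_md5_keys() below); the
 * peer address, key bytes and descriptor are illustrative only:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	a->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */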
823 /* Find the Key structure for an address.  */
824 static struct tcp_md5sig_key *
825                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
826 {
827         struct tcp_sock *tp = tcp_sk(sk);
828         int i;
829
830         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
831                 return NULL;
832         for (i = 0; i < tp->md5sig_info->entries4; i++) {
833                 if (tp->md5sig_info->keys4[i].addr == addr)
834                         return (struct tcp_md5sig_key *)
835                                                 &tp->md5sig_info->keys4[i];
836         }
837         return NULL;
838 }
839
840 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
841                                          struct sock *addr_sk)
842 {
843         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
844 }
845
846 EXPORT_SYMBOL(tcp_v4_md5_lookup);
847
848 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
849                                                       struct request_sock *req)
850 {
851         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
852 }
853
854 /* This can be called on a newly created socket, from other files */
855 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
856                       u8 *newkey, u8 newkeylen)
857 {
858         /* Add Key to the list */
859         struct tcp4_md5sig_key *key;
860         struct tcp_sock *tp = tcp_sk(sk);
861         struct tcp4_md5sig_key *keys;
862
863         key = (struct tcp4_md5sig_key *)tcp_v4_md5_do_lookup(sk, addr);
864         if (key) {
865                 /* Pre-existing entry - just update that one. */
866                 kfree(key->key);
867                 key->key = newkey;
868                 key->keylen = newkeylen;
869         } else {
870                 struct tcp_md5sig_info *md5sig;
871
872                 if (!tp->md5sig_info) {
873                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
874                                                   GFP_ATOMIC);
875                         if (!tp->md5sig_info) {
876                                 kfree(newkey);
877                                 return -ENOMEM;
878                         }
879                 }
880                 if (tcp_alloc_md5sig_pool() == NULL) {
881                         kfree(newkey);
882                         return -ENOMEM;
883                 }
884                 md5sig = tp->md5sig_info;
885
886                 if (md5sig->alloced4 == md5sig->entries4) {
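                        /* The key array is full; grow it by one slot with a
                         * simple copy - per-socket key counts are tiny.
                         */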
887                         keys = kmalloc((sizeof(*keys) *
888                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
889                         if (!keys) {
890                                 kfree(newkey);
891                                 tcp_free_md5sig_pool();
892                                 return -ENOMEM;
893                         }
894
895                         if (md5sig->entries4)
896                                 memcpy(keys, md5sig->keys4,
897                                        sizeof(*keys) * md5sig->entries4);
898
899                         /* Free old key list, and reference new one */
900                         if (md5sig->keys4)
901                                 kfree(md5sig->keys4);
902                         md5sig->keys4 = keys;
903                         md5sig->alloced4++;
904                 }
905                 md5sig->entries4++;
906                 md5sig->keys4[md5sig->entries4 - 1].addr   = addr;
907                 md5sig->keys4[md5sig->entries4 - 1].key    = newkey;
908                 md5sig->keys4[md5sig->entries4 - 1].keylen = newkeylen;
909         }
910         return 0;
911 }
912
913 EXPORT_SYMBOL(tcp_v4_md5_do_add);
914
915 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
916                                u8 *newkey, u8 newkeylen)
917 {
918         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
919                                  newkey, newkeylen);
920 }
921
922 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
923 {
924         struct tcp_sock *tp = tcp_sk(sk);
925         int i;
926
927         for (i = 0; i < tp->md5sig_info->entries4; i++) {
928                 if (tp->md5sig_info->keys4[i].addr == addr) {
929                         /* Free the key */
930                         kfree(tp->md5sig_info->keys4[i].key);
931                         tp->md5sig_info->entries4--;
932
933                         if (tp->md5sig_info->entries4 == 0) {
934                                 kfree(tp->md5sig_info->keys4);
935                                 tp->md5sig_info->keys4 = NULL;
936                                 tp->md5sig_info->alloced4 = 0;
937                         } else if (tp->md5sig_info->entries4 != i) {
938                                 /* Need to do some manipulation */
939                                 memcpy(&tp->md5sig_info->keys4[i],
940                                        &tp->md5sig_info->keys4[i+1],
941                                        (tp->md5sig_info->entries4 - i) *
942                                         sizeof(struct tcp4_md5sig_key));
943                         }
944                         tcp_free_md5sig_pool();
945                         return 0;
946                 }
947         }
948         return -ENOENT;
949 }
950
951 EXPORT_SYMBOL(tcp_v4_md5_do_del);
952
953 static void tcp_v4_clear_md5_list(struct sock *sk)
954 {
955         struct tcp_sock *tp = tcp_sk(sk);
956
957         /* Free each key, then the set of keys,
958          * the crypto element, and then decrement our
959          * hold on the last resort crypto.
960          */
961         if (tp->md5sig_info->entries4) {
962                 int i;
963                 for (i = 0; i < tp->md5sig_info->entries4; i++)
964                         kfree(tp->md5sig_info->keys4[i].key);
965                 tp->md5sig_info->entries4 = 0;
966                 tcp_free_md5sig_pool();
967         }
968         if (tp->md5sig_info->keys4) {
969                 kfree(tp->md5sig_info->keys4);
970                 tp->md5sig_info->keys4 = NULL;
971                 tp->md5sig_info->alloced4  = 0;
972         }
973 }
974
975 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
976                                  int optlen)
977 {
978         struct tcp_md5sig cmd;
979         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
980         u8 *newkey;
981
982         if (optlen < sizeof(cmd))
983                 return -EINVAL;
984
985         if (copy_from_user(&cmd, optval, sizeof(cmd)))
986                 return -EFAULT;
987
988         if (sin->sin_family != AF_INET)
989                 return -EINVAL;
990
991         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
992                 if (!tcp_sk(sk)->md5sig_info)
993                         return -ENOENT;
994                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
995         }
996
997         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
998                 return -EINVAL;
999
1000         if (!tcp_sk(sk)->md5sig_info) {
1001                 struct tcp_sock *tp = tcp_sk(sk);
1002                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
1003
1004                 if (!p)
1005                         return -EINVAL;
1006
1007                 tp->md5sig_info = p;
1008
1009         }
1010
1011         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1012         if (!newkey)
1013                 return -ENOMEM;
1014         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1015                                  newkey, cmd.tcpm_keylen);
1016 }
1017
1018 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1019                                    __be32 saddr, __be32 daddr,
1020                                    struct tcphdr *th, int protocol,
1021                                    int tcplen)
1022 {
1023         struct scatterlist sg[4];
1024         __u16 data_len;
1025         int block = 0;
1026         __sum16 old_checksum;
1027         struct tcp_md5sig_pool *hp;
1028         struct tcp4_pseudohdr *bp;
1029         struct hash_desc *desc;
1030         int err;
1031         unsigned int nbytes = 0;
1032
1033         /*
1034          * Okay, so RFC2385 is turned on for this connection,
1035          * so we need to generate the MD5 hash for the packet now.
1036          */
1037
1038         hp = tcp_get_md5sig_pool();
1039         if (!hp)
1040                 goto clear_hash_noput;
1041
1042         bp = &hp->md5_blk.ip4;
1043         desc = &hp->md5_desc;
1044
1045         /*
1046          * 1. the TCP pseudo-header (in the order: source IP address,
1047          * destination IP address, zero-padded protocol number, and
1048          * segment length)
1049          */
1050         bp->saddr = saddr;
1051         bp->daddr = daddr;
1052         bp->pad = 0;
1053         bp->protocol = protocol;
1054         bp->len = htons(tcplen);
1055         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1056         nbytes += sizeof(*bp);
1057
1058         /* 2. the TCP header, excluding options, and assuming a
1059          * checksum of zero.
1060          */
1061         old_checksum = th->check;
1062         th->check = 0;
1063         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1064         nbytes += sizeof(struct tcphdr);
1065
1066         /* 3. the TCP segment data (if any) */
1067         data_len = tcplen - (th->doff << 2);
1068         if (data_len > 0) {
1069                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1070                 sg_set_buf(&sg[block++], data, data_len);
1071                 nbytes += data_len;
1072         }
1073
1074         /* 4. an independently-specified key or password, known to both
1075          * TCPs and presumably connection-specific
1076          */
1077         sg_set_buf(&sg[block++], key->key, key->keylen);
1078         nbytes += key->keylen;
1079
1080         /* Now store the Hash into the packet */
1081         err = crypto_hash_init(desc);
1082         if (err)
1083                 goto clear_hash;
1084         err = crypto_hash_update(desc, sg, nbytes);
1085         if (err)
1086                 goto clear_hash;
1087         err = crypto_hash_final(desc, md5_hash);
1088         if (err)
1089                 goto clear_hash;
1090
1091         /* Reset header, and free up the crypto */
1092         tcp_put_md5sig_pool();
1093         th->check = old_checksum;
1094
1095 out:
1096         return 0;
1097 clear_hash:
1098         tcp_put_md5sig_pool();
1099 clear_hash_noput:
1100         memset(md5_hash, 0, 16);
1101         goto out;
1102 }
1103
1104 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1105                          struct sock *sk,
1106                          struct dst_entry *dst,
1107                          struct request_sock *req,
1108                          struct tcphdr *th, int protocol,
1109                          int tcplen)
1110 {
1111         __be32 saddr, daddr;
1112
1113         if (sk) {
1114                 saddr = inet_sk(sk)->saddr;
1115                 daddr = inet_sk(sk)->daddr;
1116         } else {
1117                 struct rtable *rt = (struct rtable *)dst;
1118                 BUG_ON(!rt);
1119                 saddr = rt->rt_src;
1120                 daddr = rt->rt_dst;
1121         }
1122         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1123                                        saddr, daddr,
1124                                        th, protocol, tcplen);
1125 }
1126
1127 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1128
1129 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1130 {
1131         /*
1132          * This gets called for each TCP segment that arrives
1133          * so we want to be efficient.
1134          * We have 3 drop cases:
1135          * o No MD5 hash and one expected.
1136          * o MD5 hash and we're not expecting one.
1137          * o MD5 hash and it's wrong.
1138          */
1139         __u8 *hash_location = NULL;
1140         struct tcp_md5sig_key *hash_expected;
1141         const struct iphdr *iph = ip_hdr(skb);
1142         struct tcphdr *th = tcp_hdr(skb);
1143         int length = (th->doff << 2) - sizeof(struct tcphdr);
1144         int genhash;
1145         unsigned char *ptr;
1146         unsigned char newhash[16];
1147
1148         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1149
1150         /*
1151          * If the TCP option length is less than the TCP_MD5SIG
1152          * option length, then we can shortcut
1153          */
1154         if (length < TCPOLEN_MD5SIG) {
1155                 if (hash_expected)
1156                         return 1;
1157                 else
1158                         return 0;
1159         }
1160
1161         /* Okay, we can't shortcut - we have to grub through the options */
1162         ptr = (unsigned char *)(th + 1);
1163         while (length > 0) {
1164                 int opcode = *ptr++;
1165                 int opsize;
1166
1167                 switch (opcode) {
1168                 case TCPOPT_EOL:
1169                         goto done_opts;
1170                 case TCPOPT_NOP:
1171                         length--;
1172                         continue;
1173                 default:
1174                         opsize = *ptr++;
1175                         if (opsize < 2)
1176                                 goto done_opts;
1177                         if (opsize > length)
1178                                 goto done_opts;
1179
1180                         if (opcode == TCPOPT_MD5SIG) {
1181                                 hash_location = ptr;
1182                                 goto done_opts;
1183                         }
1184                 }
1185                 ptr += opsize-2;
1186                 length -= opsize;
1187         }
1188 done_opts:
1189         /* We've parsed the options - do we have a hash? */
1190         if (!hash_expected && !hash_location)
1191                 return 0;
1192
1193         if (hash_expected && !hash_location) {
1194                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1195                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1196                                NIPQUAD(iph->saddr), ntohs(th->source),
1197                                NIPQUAD(iph->daddr), ntohs(th->dest));
1198                 return 1;
1199         }
1200
1201         if (!hash_expected && hash_location) {
1202                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1203                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1204                                NIPQUAD(iph->saddr), ntohs(th->source),
1205                                NIPQUAD(iph->daddr), ntohs(th->dest));
1206                 return 1;
1207         }
1208
1209         /* Okay, so this is hash_expected and hash_location -
1210          * so we need to calculate the checksum.
1211          */
1212         genhash = tcp_v4_do_calc_md5_hash(newhash,
1213                                           hash_expected,
1214                                           iph->saddr, iph->daddr,
1215                                           th, sk->sk_protocol,
1216                                           skb->len);
1217
1218         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1219                 if (net_ratelimit()) {
1220                         printk(KERN_INFO "MD5 Hash failed for "
1221                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1222                                NIPQUAD(iph->saddr), ntohs(th->source),
1223                                NIPQUAD(iph->daddr), ntohs(th->dest),
1224                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1225                 }
1226                 return 1;
1227         }
1228         return 0;
1229 }
1230
1231 #endif
1232
1233 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1234         .family         =       PF_INET,
1235         .obj_size       =       sizeof(struct tcp_request_sock),
1236         .rtx_syn_ack    =       tcp_v4_send_synack,
1237         .send_ack       =       tcp_v4_reqsk_send_ack,
1238         .destructor     =       tcp_v4_reqsk_destructor,
1239         .send_reset     =       tcp_v4_send_reset,
1240 };
1241
1242 #ifdef CONFIG_TCP_MD5SIG
1243 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1244         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1245 };
1246 #endif
1247
1248 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1249         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1250         .twsk_unique    = tcp_twsk_unique,
1251         .twsk_destructor= tcp_twsk_destructor,
1252 };
1253
1254 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1255 {
1256         struct inet_request_sock *ireq;
1257         struct tcp_options_received tmp_opt;
1258         struct request_sock *req;
1259         __be32 saddr = ip_hdr(skb)->saddr;
1260         __be32 daddr = ip_hdr(skb)->daddr;
1261         __u32 isn = TCP_SKB_CB(skb)->when;
1262         struct dst_entry *dst = NULL;
1263 #ifdef CONFIG_SYN_COOKIES
1264         int want_cookie = 0;
1265 #else
1266 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1267 #endif
1268
1269         /* Never answer SYNs sent to broadcast or multicast */
1270         if (((struct rtable *)skb->dst)->rt_flags &
1271             (RTCF_BROADCAST | RTCF_MULTICAST))
1272                 goto drop;
1273
1274         /* TW buckets are converted to open requests without
1275          * limitation; they conserve resources and the peer is
1276          * evidently a real one.
1277          */
1278         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1279 #ifdef CONFIG_SYN_COOKIES
1280                 if (sysctl_tcp_syncookies) {
1281                         want_cookie = 1;
1282                 } else
1283 #endif
1284                 goto drop;
1285         }
1286
1287         /* The accept backlog is full. If we have already queued enough
1288          * warm entries in the syn queue, drop the request. That is better
1289          * than clogging the syn queue with openreqs with exponentially
1290          * increasing timeouts.
1291          */
1292         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1293                 goto drop;
1294
1295         req = reqsk_alloc(&tcp_request_sock_ops);
1296         if (!req)
1297                 goto drop;
1298
1299 #ifdef CONFIG_TCP_MD5SIG
1300         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1301 #endif
1302
1303         tcp_clear_options(&tmp_opt);
1304         tmp_opt.mss_clamp = 536;
1305         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1306
1307         tcp_parse_options(skb, &tmp_opt, 0);
1308
1309         if (want_cookie) {
1310                 tcp_clear_options(&tmp_opt);
1311                 tmp_opt.saw_tstamp = 0;
1312         }
1313
1314         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1315                 /* Some OSes (unknown ones, but I have seen them on a web
1316                  * server which contains information interesting only to
1317                  * Windows users) do not send their timestamp in the SYN.
1318                  * It is an easy case: we simply do not advertise TS support.
1319                  */
1320                 tmp_opt.saw_tstamp = 0;
1321                 tmp_opt.tstamp_ok  = 0;
1322         }
1323         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1324
1325         tcp_openreq_init(req, &tmp_opt, skb);
1326
1327         if (security_inet_conn_request(sk, skb, req))
1328                 goto drop_and_free;
1329
1330         ireq = inet_rsk(req);
1331         ireq->loc_addr = daddr;
1332         ireq->rmt_addr = saddr;
1333         ireq->opt = tcp_v4_save_options(sk, skb);
1334         if (!want_cookie)
1335                 TCP_ECN_create_request(req, tcp_hdr(skb));
1336
1337         if (want_cookie) {
1338 #ifdef CONFIG_SYN_COOKIES
1339                 syn_flood_warning(skb);
1340 #endif
1341                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1342         } else if (!isn) {
1343                 struct inet_peer *peer = NULL;
1344
1345                 /* VJ's idea. We save the last timestamp seen
1346                  * from the destination in the peer table when entering
1347                  * TIME-WAIT state, and check against it before
1348                  * accepting a new connection request.
1349                  *
1350                  * If "isn" is not zero, this request hit a live
1351                  * timewait bucket, so all the necessary checks
1352                  * are made in the function processing the timewait state.
1353                  */
1354                 if (tmp_opt.saw_tstamp &&
1355                     tcp_death_row.sysctl_tw_recycle &&
1356                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1357                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1358                     peer->v4daddr == saddr) {
1359                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1360                             (s32)(peer->tcp_ts - req->ts_recent) >
1361                                                         TCP_PAWS_WINDOW) {
1362                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1363                                 dst_release(dst);
1364                                 goto drop_and_free;
1365                         }
1366                 }
1367                 /* Kill the following clause, if you dislike this way. */
1368                 else if (!sysctl_tcp_syncookies &&
1369                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1370                           (sysctl_max_syn_backlog >> 2)) &&
1371                          (!peer || !peer->tcp_ts_stamp) &&
1372                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1373                         /* Without syncookies the last quarter of
1374                          * the backlog is filled with destinations
1375                          * proven to be alive.
1376                          * It means that we continue to communicate
1377                          * with destinations already remembered
1378                          * at the moment of the synflood.
1379                          */
1380                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1381                                        "request from %u.%u.%u.%u/%u\n",
1382                                        NIPQUAD(saddr),
1383                                        ntohs(tcp_hdr(skb)->source));
1384                         dst_release(dst);
1385                         goto drop_and_free;
1386                 }
1387
1388                 isn = tcp_v4_init_sequence(skb);
1389         }
1390         tcp_rsk(req)->snt_isn = isn;
1391
1392         if (tcp_v4_send_synack(sk, req, dst))
1393                 goto drop_and_free;
1394
1395         if (want_cookie) {
1396                 reqsk_free(req);
1397         } else {
1398                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1399         }
1400         return 0;
1401
1402 drop_and_free:
1403         reqsk_free(req);
1404 drop:
1405         return 0;
1406 }
1407
1408
1409 /*
1410  * The three way handshake has completed - we got a valid ACK -
1411  * now create the new socket.
1412  */
1413 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1414                                   struct request_sock *req,
1415                                   struct dst_entry *dst)
1416 {
1417         struct inet_request_sock *ireq;
1418         struct inet_sock *newinet;
1419         struct tcp_sock *newtp;
1420         struct sock *newsk;
1421 #ifdef CONFIG_TCP_MD5SIG
1422         struct tcp_md5sig_key *key;
1423 #endif
1424
1425         if (sk_acceptq_is_full(sk))
1426                 goto exit_overflow;
1427
1428         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1429                 goto exit;
1430
1431         newsk = tcp_create_openreq_child(sk, req, skb);
1432         if (!newsk)
1433                 goto exit;
1434
1435         newsk->sk_gso_type = SKB_GSO_TCPV4;
1436         sk_setup_caps(newsk, dst);
1437
1438         newtp                 = tcp_sk(newsk);
1439         newinet               = inet_sk(newsk);
1440         ireq                  = inet_rsk(req);
1441         newinet->daddr        = ireq->rmt_addr;
1442         newinet->rcv_saddr    = ireq->loc_addr;
1443         newinet->saddr        = ireq->loc_addr;
1444         newinet->opt          = ireq->opt;
1445         ireq->opt             = NULL;
1446         newinet->mc_index     = inet_iif(skb);
1447         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1448         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1449         if (newinet->opt)
1450                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1451         newinet->id = newtp->write_seq ^ jiffies;
1452
1453         tcp_mtup_init(newsk);
1454         tcp_sync_mss(newsk, dst_mtu(dst));
1455         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1456         tcp_initialize_rcv_mss(newsk);
1457
1458 #ifdef CONFIG_TCP_MD5SIG
1459         /* Copy over the MD5 key from the original socket */
1460         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1461                 /*
1462                  * We're using one, so create a matching key
1463                  * on the newsk structure. If we fail to get
1464                  * memory, then we end up not copying the key
1465                  * across. Shucks.
1466                  */
1467                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1468                 if (newkey != NULL)
1469                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1470                                           newkey, key->keylen);
1471         }
1472 #endif
1473
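             /* Insert the child into the established hash and let it
              * inherit the listener's bound port.
              */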
1474         __inet_hash(&tcp_hashinfo, newsk, 0);
1475         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1476
1477         return newsk;
1478
1479 exit_overflow:
1480         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1481 exit:
1482         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1483         dst_release(dst);
1484         return NULL;
1485 }
1486
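     /* Handle a segment arriving on a listening socket: first look for a
      * matching open request in the SYN queue, then for an established
      * socket already created for this connection, and as a last resort
      * try syncookie validation of a bare ACK.
      */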
1487 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1488 {
1489         struct tcphdr *th = tcp_hdr(skb);
1490         const struct iphdr *iph = ip_hdr(skb);
1491         struct sock *nsk;
1492         struct request_sock **prev;
1493         /* Find possible connection requests. */
1494         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1495                                                        iph->saddr, iph->daddr);
1496         if (req)
1497                 return tcp_check_req(sk, skb, req, prev);
1498
1499         nsk = inet_lookup_established(&tcp_hashinfo, iph->saddr, th->source,
1500                                       iph->daddr, th->dest, inet_iif(skb));
1501
1502         if (nsk) {
1503                 if (nsk->sk_state != TCP_TIME_WAIT) {
1504                         bh_lock_sock(nsk);
1505                         return nsk;
1506                 }
1507                 inet_twsk_put(inet_twsk(nsk));
1508                 return NULL;
1509         }
1510
1511 #ifdef CONFIG_SYN_COOKIES
1512         if (!th->rst && !th->syn && th->ack)
1513                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1514 #endif
1515         return sk;
1516 }
1517
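     /* Verify or defer the TCP checksum.  If the NIC provided a full
      * checksum (CHECKSUM_COMPLETE) validate it right away; otherwise
      * seed skb->csum with the pseudo-header sum and verify only short
      * packets here, deferring longer ones to copy-and-checksum later.
      */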
1518 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1519 {
1520         const struct iphdr *iph = ip_hdr(skb);
1521
1522         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1523                 if (!tcp_v4_check(skb->len, iph->saddr,
1524                                   iph->daddr, skb->csum)) {
1525                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1526                         return 0;
1527                 }
1528         }
1529
1530         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1531                                        skb->len, IPPROTO_TCP, 0);
1532
1533         if (skb->len <= 76) {
1534                 return __skb_checksum_complete(skb);
1535         }
1536         return 0;
1537 }
1538
1539
1540 /* The socket must have its spinlock held when we get
1541  * here.
1542  *
1543  * We have a potential double-lock case here, so even when
1544  * doing backlog processing we use the BH locking scheme.
1545  * This is because we cannot sleep with the original spinlock
1546  * held.
1547  */
1548 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1549 {
1550         struct sock *rsk;
1551 #ifdef CONFIG_TCP_MD5SIG
1552         /*
1553          * We really want to reject the packet as early as possible
1554          * if:
1555          *  o We're expecting an MD5-signed packet and there is no MD5 TCP option
1556          *  o There is an MD5 option and we're not expecting one
1557          */
1558         if (tcp_v4_inbound_md5_hash(sk, skb))
1559                 goto discard;
1560 #endif
1561
1562         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1563                 TCP_CHECK_TIMER(sk);
1564                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1565                         rsk = sk;
1566                         goto reset;
1567                 }
1568                 TCP_CHECK_TIMER(sk);
1569                 return 0;
1570         }
1571
1572         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1573                 goto csum_err;
1574
1575         if (sk->sk_state == TCP_LISTEN) {
1576                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1577                 if (!nsk)
1578                         goto discard;
1579
1580                 if (nsk != sk) {
1581                         if (tcp_child_process(sk, nsk, skb)) {
1582                                 rsk = nsk;
1583                                 goto reset;
1584                         }
1585                         return 0;
1586                 }
1587         }
1588
1589         TCP_CHECK_TIMER(sk);
1590         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1591                 rsk = sk;
1592                 goto reset;
1593         }
1594         TCP_CHECK_TIMER(sk);
1595         return 0;
1596
1597 reset:
1598         tcp_v4_send_reset(rsk, skb);
1599 discard:
1600         kfree_skb(skb);
1601         /* Be careful here. If this function gets more complicated and
1602          * gcc suffers from register pressure on the x86, sk (in %ebx)
1603          * might be destroyed here. This current version compiles correctly,
1604          * but you have been warned.
1605          */
1606         return 0;
1607
1608 csum_err:
1609         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1610         goto discard;
1611 }
1612
1613 /*
1614  *      From tcp_input.c
1615  */
1616
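     /* Main IPv4 receive entry point, called from the IP layer.  Validate
      * the header, fill in the TCP control block, look up the owning
      * socket and either process the segment directly, prequeue it, or
      * park it on the backlog while the socket is owned by user context.
      */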
1617 int tcp_v4_rcv(struct sk_buff *skb)
1618 {
1619         const struct iphdr *iph;
1620         struct tcphdr *th;
1621         struct sock *sk;
1622         int ret;
1623
1624         if (skb->pkt_type != PACKET_HOST)
1625                 goto discard_it;
1626
1627         /* Count it even if it's bad */
1628         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1629
1630         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1631                 goto discard_it;
1632
1633         th = tcp_hdr(skb);
1634
1635         if (th->doff < sizeof(struct tcphdr) / 4)
1636                 goto bad_packet;
1637         if (!pskb_may_pull(skb, th->doff * 4))
1638                 goto discard_it;
1639
1640         /* An explanation is required here, I think.
1641          * Packet length and doff are validated by header prediction,
1642          * provided the case of th->doff==0 is eliminated.
1643          * So, we defer the checks. */
1644         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1645                 goto bad_packet;
1646
1647         th = tcp_hdr(skb);
1648         iph = ip_hdr(skb);
1649         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1650         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1651                                     skb->len - th->doff * 4);
1652         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1653         TCP_SKB_CB(skb)->when    = 0;
1654         TCP_SKB_CB(skb)->flags   = iph->tos;
1655         TCP_SKB_CB(skb)->sacked  = 0;
1656
1657         sk = __inet_lookup(&tcp_hashinfo, iph->saddr, th->source,
1658                            iph->daddr, th->dest, inet_iif(skb));
1659         if (!sk)
1660                 goto no_tcp_socket;
1661
1662 process:
1663         if (sk->sk_state == TCP_TIME_WAIT)
1664                 goto do_time_wait;
1665
1666         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1667                 goto discard_and_relse;
1668         nf_reset(skb);
1669
1670         if (sk_filter(sk, skb))
1671                 goto discard_and_relse;
1672
1673         skb->dev = NULL;
1674
1675         bh_lock_sock_nested(sk);
1676         ret = 0;
1677         if (!sock_owned_by_user(sk)) {
1678 #ifdef CONFIG_NET_DMA
1679                 struct tcp_sock *tp = tcp_sk(sk);
1680                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1681                         tp->ucopy.dma_chan = get_softnet_dma();
1682                 if (tp->ucopy.dma_chan)
1683                         ret = tcp_v4_do_rcv(sk, skb);
1684                 else
1685 #endif
1686                 {
1687                         if (!tcp_prequeue(sk, skb))
1688                                 ret = tcp_v4_do_rcv(sk, skb);
1689                 }
1690         } else
1691                 sk_add_backlog(sk, skb);
1692         bh_unlock_sock(sk);
1693
1694         sock_put(sk);
1695
1696         return ret;
1697
1698 no_tcp_socket:
1699         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1700                 goto discard_it;
1701
1702         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1703 bad_packet:
1704                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1705         } else {
1706                 tcp_v4_send_reset(NULL, skb);
1707         }
1708
1709 discard_it:
1710         /* Discard frame. */
1711         kfree_skb(skb);
1712         return 0;
1713
1714 discard_and_relse:
1715         sock_put(sk);
1716         goto discard_it;
1717
1718 do_time_wait:
1719         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1720                 inet_twsk_put(inet_twsk(sk));
1721                 goto discard_it;
1722         }
1723
1724         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1725                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1726                 inet_twsk_put(inet_twsk(sk));
1727                 goto discard_it;
1728         }
1729         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1730         case TCP_TW_SYN: {
1731                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1732                                                         iph->daddr, th->dest,
1733                                                         inet_iif(skb));
1734                 if (sk2) {
1735                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1736                         inet_twsk_put(inet_twsk(sk));
1737                         sk = sk2;
1738                         goto process;
1739                 }
1740                 /* Fall through to ACK */
1741         }
1742         case TCP_TW_ACK:
1743                 tcp_v4_timewait_ack(sk, skb);
1744                 break;
1745         case TCP_TW_RST:
1746                 goto no_tcp_socket;
1747         case TCP_TW_SUCCESS:;
1748         }
1749         goto discard_it;
1750 }
1751
1752 /* VJ's idea. Save the last timestamp seen from this destination
1753  * and hold it for at least the normal timewait interval, to use for
1754  * duplicate segment detection in subsequent connections before they
1755  * enter the synchronized state.
1756  */
1757
1758 int tcp_v4_remember_stamp(struct sock *sk)
1759 {
1760         struct inet_sock *inet = inet_sk(sk);
1761         struct tcp_sock *tp = tcp_sk(sk);
1762         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1763         struct inet_peer *peer = NULL;
1764         int release_it = 0;
1765
1766         if (!rt || rt->rt_dst != inet->daddr) {
1767                 peer = inet_getpeer(inet->daddr, 1);
1768                 release_it = 1;
1769         } else {
1770                 if (!rt->peer)
1771                         rt_bind_peer(rt, 1);
1772                 peer = rt->peer;
1773         }
1774
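             /* If we found a peer, refresh its cached timestamp when ours
              * is at least as recent, or when the cached one has aged past
              * TCP_PAWS_MSL and was not recorded after ours.
              */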
1775         if (peer) {
1776                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1777                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1778                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1779                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1780                         peer->tcp_ts = tp->rx_opt.ts_recent;
1781                 }
1782                 if (release_it)
1783                         inet_putpeer(peer);
1784                 return 1;
1785         }
1786
1787         return 0;
1788 }
1789
1790 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1791 {
1792         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1793
1794         if (peer) {
1795                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1796
1797                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1798                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1799                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1800                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1801                         peer->tcp_ts       = tcptw->tw_ts_recent;
1802                 }
1803                 inet_putpeer(peer);
1804                 return 1;
1805         }
1806
1807         return 0;
1808 }
1809
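     /* AF_INET operation vector used by the generic connection socket
      * (inet_connection_sock) code on behalf of TCP.
      */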
1810 struct inet_connection_sock_af_ops ipv4_specific = {
1811         .queue_xmit        = ip_queue_xmit,
1812         .send_check        = tcp_v4_send_check,
1813         .rebuild_header    = inet_sk_rebuild_header,
1814         .conn_request      = tcp_v4_conn_request,
1815         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1816         .remember_stamp    = tcp_v4_remember_stamp,
1817         .net_header_len    = sizeof(struct iphdr),
1818         .setsockopt        = ip_setsockopt,
1819         .getsockopt        = ip_getsockopt,
1820         .addr2sockaddr     = inet_csk_addr2sockaddr,
1821         .sockaddr_len      = sizeof(struct sockaddr_in),
1822 #ifdef CONFIG_COMPAT
1823         .compat_setsockopt = compat_ip_setsockopt,
1824         .compat_getsockopt = compat_ip_getsockopt,
1825 #endif
1826 };
1827
1828 #ifdef CONFIG_TCP_MD5SIG
1829 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1830         .md5_lookup             = tcp_v4_md5_lookup,
1831         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1832         .md5_add                = tcp_v4_md5_add_func,
1833         .md5_parse              = tcp_v4_parse_md5_keys,
1834 };
1835 #endif
1836
1837 /* NOTE: A lot of things are set to zero explicitly by the call to
1838  *       sk_alloc(), so they need not be done here.
1839  */
1840 static int tcp_v4_init_sock(struct sock *sk)
1841 {
1842         struct inet_connection_sock *icsk = inet_csk(sk);
1843         struct tcp_sock *tp = tcp_sk(sk);
1844
1845         skb_queue_head_init(&tp->out_of_order_queue);
1846         tcp_init_xmit_timers(sk);
1847         tcp_prequeue_init(tp);
1848
1849         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1850         tp->mdev = TCP_TIMEOUT_INIT;
1851
1852         /* So many TCP implementations out there (incorrectly) count the
1853          * initial SYN frame in their delayed-ACK and congestion control
1854          * algorithms that we must have the following bandaid to talk
1855          * efficiently to them.  -DaveM
1856          */
1857         tp->snd_cwnd = 2;
1858
1859         /* See draft-stevens-tcpca-spec-01 for discussion of the
1860          * initialization of these values.
1861          */
1862         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1863         tp->snd_cwnd_clamp = ~0;
1864         tp->mss_cache = 536;
1865
1866         tp->reordering = sysctl_tcp_reordering;
1867         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1868
1869         sk->sk_state = TCP_CLOSE;
1870
1871         sk->sk_write_space = sk_stream_write_space;
1872         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1873
1874         icsk->icsk_af_ops = &ipv4_specific;
1875         icsk->icsk_sync_mss = tcp_sync_mss;
1876 #ifdef CONFIG_TCP_MD5SIG
1877         tp->af_specific = &tcp_sock_ipv4_specific;
1878 #endif
1879
1880         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1881         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1882
1883         atomic_inc(&tcp_sockets_allocated);
1884
1885         return 0;
1886 }
1887
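     /* Final per-socket cleanup: stop the retransmit timers, release the
      * congestion control module, purge all queues and drop the bound
      * port reference.
      */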
1888 int tcp_v4_destroy_sock(struct sock *sk)
1889 {
1890         struct tcp_sock *tp = tcp_sk(sk);
1891
1892         tcp_clear_xmit_timers(sk);
1893
1894         tcp_cleanup_congestion_control(sk);
1895
1896         /* Clean up the write buffer. */
1897         tcp_write_queue_purge(sk);
1898
1899         /* Cleans up our, hopefully empty, out_of_order_queue. */
1900         __skb_queue_purge(&tp->out_of_order_queue);
1901
1902 #ifdef CONFIG_TCP_MD5SIG
1903         /* Clean up the MD5 key list, if any */
1904         if (tp->md5sig_info) {
1905                 tcp_v4_clear_md5_list(sk);
1906                 kfree(tp->md5sig_info);
1907                 tp->md5sig_info = NULL;
1908         }
1909 #endif
1910
1911 #ifdef CONFIG_NET_DMA
1912         /* Cleans up our sk_async_wait_queue */
1913         __skb_queue_purge(&sk->sk_async_wait_queue);
1914 #endif
1915
1916         /* Clean up the prequeue; it really should be empty. */
1917         __skb_queue_purge(&tp->ucopy.prequeue);
1918
1919         /* Clean up a referenced TCP bind bucket. */
1920         if (inet_csk(sk)->icsk_bind_hash)
1921                 inet_put_port(&tcp_hashinfo, sk);
1922
1923         /*
1924          * If sendmsg cached page exists, toss it.
1925          */
1926         if (sk->sk_sndmsg_page) {
1927                 __free_page(sk->sk_sndmsg_page);
1928                 sk->sk_sndmsg_page = NULL;
1929         }
1930
1931         atomic_dec(&tcp_sockets_allocated);
1932
1933         return 0;
1934 }
1935
1936 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1937
1938 #ifdef CONFIG_PROC_FS
1939 /* Proc filesystem TCP sock list dumping. */
1940
1941 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1942 {
1943         return hlist_empty(head) ? NULL :
1944                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1945 }
1946
1947 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1948 {
1949         return tw->tw_node.next ?
1950                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1951 }
1952
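     /* Advance the /proc iterator over the listening hash.  Each
      * listening socket's table of open requests is walked as well;
      * st->state distinguishes the two sub-iterations.
      */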
1953 static void *listening_get_next(struct seq_file *seq, void *cur)
1954 {
1955         struct inet_connection_sock *icsk;
1956         struct hlist_node *node;
1957         struct sock *sk = cur;
1958         struct tcp_iter_state* st = seq->private;
1959
1960         if (!sk) {
1961                 st->bucket = 0;
1962                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1963                 goto get_sk;
1964         }
1965
1966         ++st->num;
1967
1968         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1969                 struct request_sock *req = cur;
1970
1971                 icsk = inet_csk(st->syn_wait_sk);
1972                 req = req->dl_next;
1973                 while (1) {
1974                         while (req) {
1975                                 if (req->rsk_ops->family == st->family) {
1976                                         cur = req;
1977                                         goto out;
1978                                 }
1979                                 req = req->dl_next;
1980                         }
1981                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1982                                 break;
1983 get_req:
1984                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1985                 }
1986                 sk        = sk_next(st->syn_wait_sk);
1987                 st->state = TCP_SEQ_STATE_LISTENING;
1988                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1989         } else {
1990                 icsk = inet_csk(sk);
1991                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1992                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1993                         goto start_req;
1994                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1995                 sk = sk_next(sk);
1996         }
1997 get_sk:
1998         sk_for_each_from(sk, node) {
1999                 if (sk->sk_family == st->family) {
2000                         cur = sk;
2001                         goto out;
2002                 }
2003                 icsk = inet_csk(sk);
2004                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2005                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2006 start_req:
2007                         st->uid         = sock_i_uid(sk);
2008                         st->syn_wait_sk = sk;
2009                         st->state       = TCP_SEQ_STATE_OPENREQ;
2010                         st->sbucket     = 0;
2011                         goto get_req;
2012                 }
2013                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2014         }
2015         if (++st->bucket < INET_LHTABLE_SIZE) {
2016                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2017                 goto get_sk;
2018         }
2019         cur = NULL;
2020 out:
2021         return cur;
2022 }
2023
2024 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2025 {
2026         void *rc = listening_get_next(seq, NULL);
2027
2028         while (rc && *pos) {
2029                 rc = listening_get_next(seq, rc);
2030                 --*pos;
2031         }
2032         return rc;
2033 }
2034
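     /* Find the first entry of the requested family in the established
      * hash, falling back to each bucket's TIME_WAIT chain.  On success
      * the bucket lock is left held for the iterator.
      */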
2035 static void *established_get_first(struct seq_file *seq)
2036 {
2037         struct tcp_iter_state* st = seq->private;
2038         void *rc = NULL;
2039
2040         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2041                 struct sock *sk;
2042                 struct hlist_node *node;
2043                 struct inet_timewait_sock *tw;
2044
2045                 /* We can reschedule _before_ having picked the target: */
2046                 cond_resched_softirq();
2047
2048                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2049                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2050                         if (sk->sk_family != st->family) {
2051                                 continue;
2052                         }
2053                         rc = sk;
2054                         goto out;
2055                 }
2056                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2057                 inet_twsk_for_each(tw, node,
2058                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2059                         if (tw->tw_family != st->family) {
2060                                 continue;
2061                         }
2062                         rc = tw;
2063                         goto out;
2064                 }
2065                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2066                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2067         }
2068 out:
2069         return rc;
2070 }
2071
2072 static void *established_get_next(struct seq_file *seq, void *cur)
2073 {
2074         struct sock *sk = cur;
2075         struct inet_timewait_sock *tw;
2076         struct hlist_node *node;
2077         struct tcp_iter_state* st = seq->private;
2078
2079         ++st->num;
2080
2081         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2082                 tw = cur;
2083                 tw = tw_next(tw);
2084 get_tw:
2085                 while (tw && tw->tw_family != st->family) {
2086                         tw = tw_next(tw);
2087                 }
2088                 if (tw) {
2089                         cur = tw;
2090                         goto out;
2091                 }
2092                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2093                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2094
2095                 /* We can reschedule between buckets: */
2096                 cond_resched_softirq();
2097
2098                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2099                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
2100                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2101                 } else {
2102                         cur = NULL;
2103                         goto out;
2104                 }
2105         } else
2106                 sk = sk_next(sk);
2107
2108         sk_for_each_from(sk, node) {
2109                 if (sk->sk_family == st->family)
2110                         goto found;
2111         }
2112
2113         st->state = TCP_SEQ_STATE_TIME_WAIT;
2114         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2115         goto get_tw;
2116 found:
2117         cur = sk;
2118 out:
2119         return cur;
2120 }
2121
2122 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2123 {
2124         void *rc = established_get_first(seq);
2125
2126         while (rc && pos) {
2127                 rc = established_get_next(seq, rc);
2128                 --pos;
2129         }
2130         return rc;
2131 }
2132
2133 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2134 {
2135         void *rc;
2136         struct tcp_iter_state* st = seq->private;
2137
2138         inet_listen_lock(&tcp_hashinfo);
2139         st->state = TCP_SEQ_STATE_LISTENING;
2140         rc        = listening_get_idx(seq, &pos);
2141
2142         if (!rc) {
2143                 inet_listen_unlock(&tcp_hashinfo);
2144                 local_bh_disable();
2145                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2146                 rc        = established_get_idx(seq, pos);
2147         }
2148
2149         return rc;
2150 }
2151
2152 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2153 {
2154         struct tcp_iter_state* st = seq->private;
2155         st->state = TCP_SEQ_STATE_LISTENING;
2156         st->num = 0;
2157         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2158 }
2159
2160 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2161 {
2162         void *rc = NULL;
2163         struct tcp_iter_state* st;
2164
2165         if (v == SEQ_START_TOKEN) {
2166                 rc = tcp_get_idx(seq, 0);
2167                 goto out;
2168         }
2169         st = seq->private;
2170
2171         switch (st->state) {
2172         case TCP_SEQ_STATE_OPENREQ:
2173         case TCP_SEQ_STATE_LISTENING:
2174                 rc = listening_get_next(seq, v);
2175                 if (!rc) {
2176                         inet_listen_unlock(&tcp_hashinfo);
2177                         local_bh_disable();
2178                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2179                         rc        = established_get_first(seq);
2180                 }
2181                 break;
2182         case TCP_SEQ_STATE_ESTABLISHED:
2183         case TCP_SEQ_STATE_TIME_WAIT:
2184                 rc = established_get_next(seq, v);
2185                 break;
2186         }
2187 out:
2188         ++*pos;
2189         return rc;
2190 }
2191
2192 static void tcp_seq_stop(struct seq_file *seq, void *v)
2193 {
2194         struct tcp_iter_state* st = seq->private;
2195
2196         switch (st->state) {
2197         case TCP_SEQ_STATE_OPENREQ:
2198                 if (v) {
2199                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2200                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2201                 }
2202         case TCP_SEQ_STATE_LISTENING:
2203                 if (v != SEQ_START_TOKEN)
2204                         inet_listen_unlock(&tcp_hashinfo);
2205                 break;
2206         case TCP_SEQ_STATE_TIME_WAIT:
2207         case TCP_SEQ_STATE_ESTABLISHED:
2208                 if (v)
2209                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2210                 local_bh_enable();
2211                 break;
2212         }
2213 }
2214
2215 static int tcp_seq_open(struct inode *inode, struct file *file)
2216 {
2217         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2218         struct seq_file *seq;
2219         struct tcp_iter_state *s;
2220         int rc;
2221
2222         if (unlikely(afinfo == NULL))
2223                 return -EINVAL;
2224
2225         s = kzalloc(sizeof(*s), GFP_KERNEL);
2226         if (!s)
2227                 return -ENOMEM;
2228         s->family               = afinfo->family;
2229         s->seq_ops.start        = tcp_seq_start;
2230         s->seq_ops.next         = tcp_seq_next;
2231         s->seq_ops.show         = afinfo->seq_show;
2232         s->seq_ops.stop         = tcp_seq_stop;
2233
2234         rc = seq_open(file, &s->seq_ops);
2235         if (rc)
2236                 goto out_kfree;
2237         seq          = file->private_data;
2238         seq->private = s;
2239 out:
2240         return rc;
2241 out_kfree:
2242         kfree(s);
2243         goto out;
2244 }
2245
2246 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2247 {
2248         int rc = 0;
2249         struct proc_dir_entry *p;
2250
2251         if (!afinfo)
2252                 return -EINVAL;
2253         afinfo->seq_fops->owner         = afinfo->owner;
2254         afinfo->seq_fops->open          = tcp_seq_open;
2255         afinfo->seq_fops->read          = seq_read;
2256         afinfo->seq_fops->llseek        = seq_lseek;
2257         afinfo->seq_fops->release       = seq_release_private;
2258
2259         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2260         if (p)
2261                 p->data = afinfo;
2262         else
2263                 rc = -ENOMEM;
2264         return rc;
2265 }
2266
2267 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2268 {
2269         if (!afinfo)
2270                 return;
2271         proc_net_remove(afinfo->name);
2272         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2273 }
2274
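     /* Format a SYN_RECV open request as a /proc/net/tcp line. */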
2275 static void get_openreq4(struct sock *sk, struct request_sock *req,
2276                          char *tmpbuf, int i, int uid)
2277 {
2278         const struct inet_request_sock *ireq = inet_rsk(req);
2279         int ttd = req->expires - jiffies;
2280
2281         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2282                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2283                 i,
2284                 ireq->loc_addr,
2285                 ntohs(inet_sk(sk)->sport),
2286                 ireq->rmt_addr,
2287                 ntohs(ireq->rmt_port),
2288                 TCP_SYN_RECV,
2289                 0, 0, /* could print option size, but that is af dependent. */
2290                 1,    /* timers active (only the expire timer) */
2291                 jiffies_to_clock_t(ttd),
2292                 req->retrans,
2293                 uid,
2294                 0,  /* non standard timer */
2295                 0, /* open_requests have no inode */
2296                 atomic_read(&sk->sk_refcnt),
2297                 req);
2298 }
2299
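     /* Format an established or listening socket as a /proc/net/tcp line,
      * including which timer is pending and when it expires.
      */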
2300 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2301 {
2302         int timer_active;
2303         unsigned long timer_expires;
2304         struct tcp_sock *tp = tcp_sk(sk);
2305         const struct inet_connection_sock *icsk = inet_csk(sk);
2306         struct inet_sock *inet = inet_sk(sk);
2307         __be32 dest = inet->daddr;
2308         __be32 src = inet->rcv_saddr;
2309         __u16 destp = ntohs(inet->dport);
2310         __u16 srcp = ntohs(inet->sport);
2311
2312         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2313                 timer_active    = 1;
2314                 timer_expires   = icsk->icsk_timeout;
2315         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2316                 timer_active    = 4;
2317                 timer_expires   = icsk->icsk_timeout;
2318         } else if (timer_pending(&sk->sk_timer)) {
2319                 timer_active    = 2;
2320                 timer_expires   = sk->sk_timer.expires;
2321         } else {
2322                 timer_active    = 0;
2323                 timer_expires = jiffies;
2324         }
2325
2326         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2327                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2328                 i, src, srcp, dest, destp, sk->sk_state,
2329                 tp->write_seq - tp->snd_una,
2330                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2331                                              (tp->rcv_nxt - tp->copied_seq),
2332                 timer_active,
2333                 jiffies_to_clock_t(timer_expires - jiffies),
2334                 icsk->icsk_retransmits,
2335                 sock_i_uid(sk),
2336                 icsk->icsk_probes_out,
2337                 sock_i_ino(sk),
2338                 atomic_read(&sk->sk_refcnt), sk,
2339                 icsk->icsk_rto,
2340                 icsk->icsk_ack.ato,
2341                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2342                 tp->snd_cwnd,
2343                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2344 }
2345
2346 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2347                                char *tmpbuf, int i)
2348 {
2349         __be32 dest, src;
2350         __u16 destp, srcp;
2351         int ttd = tw->tw_ttd - jiffies;
2352
2353         if (ttd < 0)
2354                 ttd = 0;
2355
2356         dest  = tw->tw_daddr;
2357         src   = tw->tw_rcv_saddr;
2358         destp = ntohs(tw->tw_dport);
2359         srcp  = ntohs(tw->tw_sport);
2360
2361         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2362                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2363                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2364                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2365                 atomic_read(&tw->tw_refcnt), tw);
2366 }
2367
2368 #define TMPSZ 150
2369
2370 static int tcp4_seq_show(struct seq_file *seq, void *v)
2371 {
2372         struct tcp_iter_state* st;
2373         char tmpbuf[TMPSZ + 1];
2374
2375         if (v == SEQ_START_TOKEN) {
2376                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2377                            "  sl  local_address rem_address   st tx_queue "
2378                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2379                            "inode");
2380                 goto out;
2381         }
2382         st = seq->private;
2383
2384         switch (st->state) {
2385         case TCP_SEQ_STATE_LISTENING:
2386         case TCP_SEQ_STATE_ESTABLISHED:
2387                 get_tcp4_sock(v, tmpbuf, st->num);
2388                 break;
2389         case TCP_SEQ_STATE_OPENREQ:
2390                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2391                 break;
2392         case TCP_SEQ_STATE_TIME_WAIT:
2393                 get_timewait4_sock(v, tmpbuf, st->num);
2394                 break;
2395         }
2396         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2397 out:
2398         return 0;
2399 }
2400
2401 static struct file_operations tcp4_seq_fops;
2402 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2403         .owner          = THIS_MODULE,
2404         .name           = "tcp",
2405         .family         = AF_INET,
2406         .seq_show       = tcp4_seq_show,
2407         .seq_fops       = &tcp4_seq_fops,
2408 };
2409
2410 int __init tcp4_proc_init(void)
2411 {
2412         return tcp_proc_register(&tcp4_seq_afinfo);
2413 }
2414
2415 void tcp4_proc_exit(void)
2416 {
2417         tcp_proc_unregister(&tcp4_seq_afinfo);
2418 }
2419 #endif /* CONFIG_PROC_FS */
2420
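     /* Protocol descriptor for TCP, registered with the inet socket layer. */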
2421 struct proto tcp_prot = {
2422         .name                   = "TCP",
2423         .owner                  = THIS_MODULE,
2424         .close                  = tcp_close,
2425         .connect                = tcp_v4_connect,
2426         .disconnect             = tcp_disconnect,
2427         .accept                 = inet_csk_accept,
2428         .ioctl                  = tcp_ioctl,
2429         .init                   = tcp_v4_init_sock,
2430         .destroy                = tcp_v4_destroy_sock,
2431         .shutdown               = tcp_shutdown,
2432         .setsockopt             = tcp_setsockopt,
2433         .getsockopt             = tcp_getsockopt,
2434         .sendmsg                = tcp_sendmsg,
2435         .recvmsg                = tcp_recvmsg,
2436         .backlog_rcv            = tcp_v4_do_rcv,
2437         .hash                   = tcp_v4_hash,
2438         .unhash                 = tcp_unhash,
2439         .get_port               = tcp_v4_get_port,
2440         .enter_memory_pressure  = tcp_enter_memory_pressure,
2441         .sockets_allocated      = &tcp_sockets_allocated,
2442         .orphan_count           = &tcp_orphan_count,
2443         .memory_allocated       = &tcp_memory_allocated,
2444         .memory_pressure        = &tcp_memory_pressure,
2445         .sysctl_mem             = sysctl_tcp_mem,
2446         .sysctl_wmem            = sysctl_tcp_wmem,
2447         .sysctl_rmem            = sysctl_tcp_rmem,
2448         .max_header             = MAX_TCP_HEADER,
2449         .obj_size               = sizeof(struct tcp_sock),
2450         .twsk_prot              = &tcp_timewait_sock_ops,
2451         .rsk_prot               = &tcp_request_sock_ops,
2452 #ifdef CONFIG_COMPAT
2453         .compat_setsockopt      = compat_tcp_setsockopt,
2454         .compat_getsockopt      = compat_tcp_getsockopt,
2455 #endif
2456 };
2457
2458 void __init tcp_v4_init(struct net_proto_family *ops)
2459 {
2460         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2461                                      IPPROTO_TCP) < 0)
2462                 panic("Failed to create the TCP control socket.\n");
2463 }
2464
2465 EXPORT_SYMBOL(ipv4_specific);
2466 EXPORT_SYMBOL(tcp_hashinfo);
2467 EXPORT_SYMBOL(tcp_prot);
2468 EXPORT_SYMBOL(tcp_unhash);
2469 EXPORT_SYMBOL(tcp_v4_conn_request);
2470 EXPORT_SYMBOL(tcp_v4_connect);
2471 EXPORT_SYMBOL(tcp_v4_do_rcv);
2472 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2473 EXPORT_SYMBOL(tcp_v4_send_check);
2474 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2475
2476 #ifdef CONFIG_PROC_FS
2477 EXPORT_SYMBOL(tcp_proc_register);
2478 EXPORT_SYMBOL(tcp_proc_unregister);
2479 #endif
2480 EXPORT_SYMBOL(sysctl_local_port_range);
2481 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2482