]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/netfilter/ipvs/ip_vs_proto_tcp.c
xfrm: dst_entries_init() per-net dst_ops
[karo-tx-linux.git] / net / netfilter / ipvs / ip_vs_proto_tcp.c
1 /*
2  * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
3  *
4  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
5  *              Julian Anastasov <ja@ssi.bg>
6  *
7  *              This program is free software; you can redistribute it and/or
8  *              modify it under the terms of the GNU General Public License
9  *              as published by the Free Software Foundation; either version
10  *              2 of the License, or (at your option) any later version.
11  *
12  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
13  *
14  *              Network name space (netns) aware.
15  *              Global data moved to netns i.e struct netns_ipvs
16  *              tcp_timeouts table has copy per netns in a hash table per
17  *              protocol ip_vs_proto_data and is handled by netns
18  */
19
20 #define KMSG_COMPONENT "IPVS"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
23 #include <linux/kernel.h>
24 #include <linux/ip.h>
25 #include <linux/tcp.h>                  /* for tcphdr */
26 #include <net/ip.h>
27 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
28 #include <net/ip6_checksum.h>
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31
32 #include <net/ip_vs.h>
33
34 static int
35 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
36                   int *verdict, struct ip_vs_conn **cpp,
37                   struct ip_vs_iphdr *iph)
38 {
39         struct net *net;
40         struct ip_vs_service *svc;
41         struct tcphdr _tcph, *th;
42         struct netns_ipvs *ipvs;
43
44         th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
45         if (th == NULL) {
46                 *verdict = NF_DROP;
47                 return 0;
48         }
49         net = skb_net(skb);
50         ipvs = net_ipvs(net);
51         /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
52         rcu_read_lock();
53         if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
54             (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
55                                       &iph->daddr, th->dest))) {
56                 int ignored;
57
58                 if (ip_vs_todrop(ipvs)) {
59                         /*
60                          * It seems that we are very loaded.
61                          * We have to drop this packet :(
62                          */
63                         rcu_read_unlock();
64                         *verdict = NF_DROP;
65                         return 0;
66                 }
67
68                 /*
69                  * Let the virtual server select a real server for the
70                  * incoming connection, and create a connection entry.
71                  */
72                 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
73                 if (!*cpp && ignored <= 0) {
74                         if (!ignored)
75                                 *verdict = ip_vs_leave(svc, skb, pd, iph);
76                         else
77                                 *verdict = NF_DROP;
78                         rcu_read_unlock();
79                         return 0;
80                 }
81         }
82         rcu_read_unlock();
83         /* NF_ACCEPT */
84         return 1;
85 }
86
87
88 static inline void
89 tcp_fast_csum_update(int af, struct tcphdr *tcph,
90                      const union nf_inet_addr *oldip,
91                      const union nf_inet_addr *newip,
92                      __be16 oldport, __be16 newport)
93 {
94 #ifdef CONFIG_IP_VS_IPV6
95         if (af == AF_INET6)
96                 tcph->check =
97                         csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
98                                          ip_vs_check_diff2(oldport, newport,
99                                                 ~csum_unfold(tcph->check))));
100         else
101 #endif
102         tcph->check =
103                 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
104                                  ip_vs_check_diff2(oldport, newport,
105                                                 ~csum_unfold(tcph->check))));
106 }
107
108
109 static inline void
110 tcp_partial_csum_update(int af, struct tcphdr *tcph,
111                      const union nf_inet_addr *oldip,
112                      const union nf_inet_addr *newip,
113                      __be16 oldlen, __be16 newlen)
114 {
115 #ifdef CONFIG_IP_VS_IPV6
116         if (af == AF_INET6)
117                 tcph->check =
118                         ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
119                                          ip_vs_check_diff2(oldlen, newlen,
120                                                 csum_unfold(tcph->check))));
121         else
122 #endif
123         tcph->check =
124                 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
125                                 ip_vs_check_diff2(oldlen, newlen,
126                                                 csum_unfold(tcph->check))));
127 }
128
129
130 static int
131 tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
132                  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
133 {
134         struct tcphdr *tcph;
135         unsigned int tcphoff = iph->len;
136         int oldlen;
137         int payload_csum = 0;
138
139 #ifdef CONFIG_IP_VS_IPV6
140         if (cp->af == AF_INET6 && iph->fragoffs)
141                 return 1;
142 #endif
143         oldlen = skb->len - tcphoff;
144
145         /* csum_check requires unshared skb */
146         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
147                 return 0;
148
149         if (unlikely(cp->app != NULL)) {
150                 int ret;
151
152                 /* Some checks before mangling */
153                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
154                         return 0;
155
156                 /* Call application helper if needed */
157                 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
158                         return 0;
159                 /* ret=2: csum update is needed after payload mangling */
160                 if (ret == 1)
161                         oldlen = skb->len - tcphoff;
162                 else
163                         payload_csum = 1;
164         }
165
166         tcph = (void *)skb_network_header(skb) + tcphoff;
167         tcph->source = cp->vport;
168
169         /* Adjust TCP checksums */
170         if (skb->ip_summed == CHECKSUM_PARTIAL) {
171                 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
172                                         htons(oldlen),
173                                         htons(skb->len - tcphoff));
174         } else if (!payload_csum) {
175                 /* Only port and addr are changed, do fast csum update */
176                 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
177                                      cp->dport, cp->vport);
178                 if (skb->ip_summed == CHECKSUM_COMPLETE)
179                         skb->ip_summed = (cp->app && pp->csum_check) ?
180                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
181         } else {
182                 /* full checksum calculation */
183                 tcph->check = 0;
184                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
185 #ifdef CONFIG_IP_VS_IPV6
186                 if (cp->af == AF_INET6)
187                         tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
188                                                       &cp->caddr.in6,
189                                                       skb->len - tcphoff,
190                                                       cp->protocol, skb->csum);
191                 else
192 #endif
193                         tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
194                                                         cp->caddr.ip,
195                                                         skb->len - tcphoff,
196                                                         cp->protocol,
197                                                         skb->csum);
198                 skb->ip_summed = CHECKSUM_UNNECESSARY;
199
200                 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
201                           pp->name, tcph->check,
202                           (char*)&(tcph->check) - (char*)tcph);
203         }
204         return 1;
205 }
206
207
208 static int
209 tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
210                  struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
211 {
212         struct tcphdr *tcph;
213         unsigned int tcphoff = iph->len;
214         int oldlen;
215         int payload_csum = 0;
216
217 #ifdef CONFIG_IP_VS_IPV6
218         if (cp->af == AF_INET6 && iph->fragoffs)
219                 return 1;
220 #endif
221         oldlen = skb->len - tcphoff;
222
223         /* csum_check requires unshared skb */
224         if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
225                 return 0;
226
227         if (unlikely(cp->app != NULL)) {
228                 int ret;
229
230                 /* Some checks before mangling */
231                 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
232                         return 0;
233
234                 /*
235                  *      Attempt ip_vs_app call.
236                  *      It will fix ip_vs_conn and iph ack_seq stuff
237                  */
238                 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
239                         return 0;
240                 /* ret=2: csum update is needed after payload mangling */
241                 if (ret == 1)
242                         oldlen = skb->len - tcphoff;
243                 else
244                         payload_csum = 1;
245         }
246
247         tcph = (void *)skb_network_header(skb) + tcphoff;
248         tcph->dest = cp->dport;
249
250         /*
251          *      Adjust TCP checksums
252          */
253         if (skb->ip_summed == CHECKSUM_PARTIAL) {
254                 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
255                                         htons(oldlen),
256                                         htons(skb->len - tcphoff));
257         } else if (!payload_csum) {
258                 /* Only port and addr are changed, do fast csum update */
259                 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
260                                      cp->vport, cp->dport);
261                 if (skb->ip_summed == CHECKSUM_COMPLETE)
262                         skb->ip_summed = (cp->app && pp->csum_check) ?
263                                          CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
264         } else {
265                 /* full checksum calculation */
266                 tcph->check = 0;
267                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
268 #ifdef CONFIG_IP_VS_IPV6
269                 if (cp->af == AF_INET6)
270                         tcph->check = csum_ipv6_magic(&cp->caddr.in6,
271                                                       &cp->daddr.in6,
272                                                       skb->len - tcphoff,
273                                                       cp->protocol, skb->csum);
274                 else
275 #endif
276                         tcph->check = csum_tcpudp_magic(cp->caddr.ip,
277                                                         cp->daddr.ip,
278                                                         skb->len - tcphoff,
279                                                         cp->protocol,
280                                                         skb->csum);
281                 skb->ip_summed = CHECKSUM_UNNECESSARY;
282         }
283         return 1;
284 }
285
286
287 static int
288 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
289 {
290         unsigned int tcphoff;
291
292 #ifdef CONFIG_IP_VS_IPV6
293         if (af == AF_INET6)
294                 tcphoff = sizeof(struct ipv6hdr);
295         else
296 #endif
297                 tcphoff = ip_hdrlen(skb);
298
299         switch (skb->ip_summed) {
300         case CHECKSUM_NONE:
301                 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
302         case CHECKSUM_COMPLETE:
303 #ifdef CONFIG_IP_VS_IPV6
304                 if (af == AF_INET6) {
305                         if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
306                                             &ipv6_hdr(skb)->daddr,
307                                             skb->len - tcphoff,
308                                             ipv6_hdr(skb)->nexthdr,
309                                             skb->csum)) {
310                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
311                                                  "Failed checksum for");
312                                 return 0;
313                         }
314                 } else
315 #endif
316                         if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
317                                               ip_hdr(skb)->daddr,
318                                               skb->len - tcphoff,
319                                               ip_hdr(skb)->protocol,
320                                               skb->csum)) {
321                                 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
322                                                  "Failed checksum for");
323                                 return 0;
324                         }
325                 break;
326         default:
327                 /* No need to checksum. */
328                 break;
329         }
330
331         return 1;
332 }
333
334
335 #define TCP_DIR_INPUT           0
336 #define TCP_DIR_OUTPUT          4
337 #define TCP_DIR_INPUT_ONLY      8
338
339 static const int tcp_state_off[IP_VS_DIR_LAST] = {
340         [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
341         [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
342         [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
343 };
344
345 /*
346  *      Timeout table[state]
347  */
348 static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
349         [IP_VS_TCP_S_NONE]              =       2*HZ,
350         [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
351         [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
352         [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
353         [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
354         [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
355         [IP_VS_TCP_S_CLOSE]             =       10*HZ,
356         [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
357         [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
358         [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
359         [IP_VS_TCP_S_SYNACK]            =       120*HZ,
360         [IP_VS_TCP_S_LAST]              =       2*HZ,
361 };
362
363 static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
364         [IP_VS_TCP_S_NONE]              =       "NONE",
365         [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
366         [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
367         [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
368         [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
369         [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
370         [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
371         [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
372         [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
373         [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
374         [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
375         [IP_VS_TCP_S_LAST]              =       "BUG!",
376 };
377
378 #define sNO IP_VS_TCP_S_NONE
379 #define sES IP_VS_TCP_S_ESTABLISHED
380 #define sSS IP_VS_TCP_S_SYN_SENT
381 #define sSR IP_VS_TCP_S_SYN_RECV
382 #define sFW IP_VS_TCP_S_FIN_WAIT
383 #define sTW IP_VS_TCP_S_TIME_WAIT
384 #define sCL IP_VS_TCP_S_CLOSE
385 #define sCW IP_VS_TCP_S_CLOSE_WAIT
386 #define sLA IP_VS_TCP_S_LAST_ACK
387 #define sLI IP_VS_TCP_S_LISTEN
388 #define sSA IP_VS_TCP_S_SYNACK
389
390 struct tcp_states_t {
391         int next_state[IP_VS_TCP_S_LAST];
392 };
393
394 static const char * tcp_state_name(int state)
395 {
396         if (state >= IP_VS_TCP_S_LAST)
397                 return "ERR!";
398         return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
399 }
400
401 static struct tcp_states_t tcp_states [] = {
402 /*      INPUT */
403 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
404 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
405 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
406 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
407 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
408
409 /*      OUTPUT */
410 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
411 /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
412 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
413 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
414 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
415
416 /*      INPUT-ONLY */
417 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
418 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
419 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
420 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
421 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
422 };
423
424 static struct tcp_states_t tcp_states_dos [] = {
425 /*      INPUT */
426 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
427 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
428 /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
429 /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
430 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
431
432 /*      OUTPUT */
433 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
434 /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
435 /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
436 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
437 /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
438
439 /*      INPUT-ONLY */
440 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
441 /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
442 /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
443 /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
444 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
445 };
446
447 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
448 {
449         int on = (flags & 1);           /* secure_tcp */
450
451         /*
452         ** FIXME: change secure_tcp to independent sysctl var
453         ** or make it per-service or per-app because it is valid
454         ** for most if not for all of the applications. Something
455         ** like "capabilities" (flags) for each object.
456         */
457         pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
458 }
459
460 static inline int tcp_state_idx(struct tcphdr *th)
461 {
462         if (th->rst)
463                 return 3;
464         if (th->syn)
465                 return 0;
466         if (th->fin)
467                 return 1;
468         if (th->ack)
469                 return 2;
470         return -1;
471 }
472
473 static inline void
474 set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
475               int direction, struct tcphdr *th)
476 {
477         int state_idx;
478         int new_state = IP_VS_TCP_S_CLOSE;
479         int state_off = tcp_state_off[direction];
480
481         /*
482          *    Update state offset to INPUT_ONLY if necessary
483          *    or delete NO_OUTPUT flag if output packet detected
484          */
485         if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
486                 if (state_off == TCP_DIR_OUTPUT)
487                         cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
488                 else
489                         state_off = TCP_DIR_INPUT_ONLY;
490         }
491
492         if ((state_idx = tcp_state_idx(th)) < 0) {
493                 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
494                 goto tcp_state_out;
495         }
496
497         new_state =
498                 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
499
500   tcp_state_out:
501         if (new_state != cp->state) {
502                 struct ip_vs_dest *dest = cp->dest;
503
504                 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
505                               "%s:%d state: %s->%s conn->refcnt:%d\n",
506                               pd->pp->name,
507                               ((state_off == TCP_DIR_OUTPUT) ?
508                                "output " : "input "),
509                               th->syn ? 'S' : '.',
510                               th->fin ? 'F' : '.',
511                               th->ack ? 'A' : '.',
512                               th->rst ? 'R' : '.',
513                               IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
514                               ntohs(cp->dport),
515                               IP_VS_DBG_ADDR(cp->af, &cp->caddr),
516                               ntohs(cp->cport),
517                               tcp_state_name(cp->state),
518                               tcp_state_name(new_state),
519                               atomic_read(&cp->refcnt));
520
521                 if (dest) {
522                         if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
523                             (new_state != IP_VS_TCP_S_ESTABLISHED)) {
524                                 atomic_dec(&dest->activeconns);
525                                 atomic_inc(&dest->inactconns);
526                                 cp->flags |= IP_VS_CONN_F_INACTIVE;
527                         } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
528                                    (new_state == IP_VS_TCP_S_ESTABLISHED)) {
529                                 atomic_inc(&dest->activeconns);
530                                 atomic_dec(&dest->inactconns);
531                                 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
532                         }
533                 }
534         }
535
536         if (likely(pd))
537                 cp->timeout = pd->timeout_table[cp->state = new_state];
538         else    /* What to do ? */
539                 cp->timeout = tcp_timeouts[cp->state = new_state];
540 }
541
542 /*
543  *      Handle state transitions
544  */
545 static void
546 tcp_state_transition(struct ip_vs_conn *cp, int direction,
547                      const struct sk_buff *skb,
548                      struct ip_vs_proto_data *pd)
549 {
550         struct tcphdr _tcph, *th;
551
552 #ifdef CONFIG_IP_VS_IPV6
553         int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
554 #else
555         int ihl = ip_hdrlen(skb);
556 #endif
557
558         th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
559         if (th == NULL)
560                 return;
561
562         spin_lock_bh(&cp->lock);
563         set_tcp_state(pd, cp, direction, th);
564         spin_unlock_bh(&cp->lock);
565 }
566
567 static inline __u16 tcp_app_hashkey(__be16 port)
568 {
569         return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
570                 & TCP_APP_TAB_MASK;
571 }
572
573
574 static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
575 {
576         struct ip_vs_app *i;
577         __u16 hash;
578         __be16 port = inc->port;
579         int ret = 0;
580         struct netns_ipvs *ipvs = net_ipvs(net);
581         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
582
583         hash = tcp_app_hashkey(port);
584
585         list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
586                 if (i->port == port) {
587                         ret = -EEXIST;
588                         goto out;
589                 }
590         }
591         list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
592         atomic_inc(&pd->appcnt);
593
594   out:
595         return ret;
596 }
597
598
599 static void
600 tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
601 {
602         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
603
604         atomic_dec(&pd->appcnt);
605         list_del_rcu(&inc->p_list);
606 }
607
608
609 static int
610 tcp_app_conn_bind(struct ip_vs_conn *cp)
611 {
612         struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
613         int hash;
614         struct ip_vs_app *inc;
615         int result = 0;
616
617         /* Default binding: bind app only for NAT */
618         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
619                 return 0;
620
621         /* Lookup application incarnations and bind the right one */
622         hash = tcp_app_hashkey(cp->vport);
623
624         rcu_read_lock();
625         list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
626                 if (inc->port == cp->vport) {
627                         if (unlikely(!ip_vs_app_inc_get(inc)))
628                                 break;
629                         rcu_read_unlock();
630
631                         IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
632                                       "%s:%u to app %s on port %u\n",
633                                       __func__,
634                                       IP_VS_DBG_ADDR(cp->af, &cp->caddr),
635                                       ntohs(cp->cport),
636                                       IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
637                                       ntohs(cp->vport),
638                                       inc->name, ntohs(inc->port));
639
640                         cp->app = inc;
641                         if (inc->init_conn)
642                                 result = inc->init_conn(inc, cp);
643                         goto out;
644                 }
645         }
646         rcu_read_unlock();
647
648   out:
649         return result;
650 }
651
652
653 /*
654  *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
655  */
656 void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
657 {
658         struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
659
660         spin_lock_bh(&cp->lock);
661         cp->state = IP_VS_TCP_S_LISTEN;
662         cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
663                            : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
664         spin_unlock_bh(&cp->lock);
665 }
666
667 /* ---------------------------------------------
668  *   timeouts is netns related now.
669  * ---------------------------------------------
670  */
671 static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
672 {
673         struct netns_ipvs *ipvs = net_ipvs(net);
674
675         ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
676         pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
677                                                         sizeof(tcp_timeouts));
678         if (!pd->timeout_table)
679                 return -ENOMEM;
680         pd->tcp_state_table =  tcp_states;
681         return 0;
682 }
683
684 static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
685 {
686         kfree(pd->timeout_table);
687 }
688
689
690 struct ip_vs_protocol ip_vs_protocol_tcp = {
691         .name =                 "TCP",
692         .protocol =             IPPROTO_TCP,
693         .num_states =           IP_VS_TCP_S_LAST,
694         .dont_defrag =          0,
695         .init =                 NULL,
696         .exit =                 NULL,
697         .init_netns =           __ip_vs_tcp_init,
698         .exit_netns =           __ip_vs_tcp_exit,
699         .register_app =         tcp_register_app,
700         .unregister_app =       tcp_unregister_app,
701         .conn_schedule =        tcp_conn_schedule,
702         .conn_in_get =          ip_vs_conn_in_get_proto,
703         .conn_out_get =         ip_vs_conn_out_get_proto,
704         .snat_handler =         tcp_snat_handler,
705         .dnat_handler =         tcp_dnat_handler,
706         .csum_check =           tcp_csum_check,
707         .state_name =           tcp_state_name,
708         .state_transition =     tcp_state_transition,
709         .app_conn_bind =        tcp_app_conn_bind,
710         .debug_packet =         ip_vs_tcpudp_debug_packet,
711         .timeout_change =       tcp_timeout_change,
712 };