]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - net/netfilter/ipvs/ip_vs_ctl.c
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6
[karo-tx-linux.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
55 /* Mutex for the IPVS sockopts (not a semaphore). And, [gs]etsockopt may sleep. */
56 static DEFINE_MUTEX(__ip_vs_mutex);
57
58 /* rwlock for the service tables; lookups take it for reading */
59 static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/* IPVS debug level reported by ip_vs_get_debug_level(); starts at zero. */
static int sysctl_ip_vs_debug_level;

/* Return the current value of the IPVS debug level. */
int ip_vs_get_debug_level(void)
{
        const int level = sysctl_ip_vs_debug_level;

        return level;
}
70 #endif
71
72 #ifdef CONFIG_IP_VS_IPV6
73 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
74 static int __ip_vs_addr_is_local_v6(struct net *net,
75                                     const struct in6_addr *addr)
76 {
77         struct rt6_info *rt;
78         struct flowi6 fl6 = {
79                 .daddr = *addr,
80         };
81
82         rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
83         if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
84                 return 1;
85
86         return 0;
87 }
88 #endif
89
90 #ifdef CONFIG_SYSCTL
91 /*
92  *      update_defense_level is called from keventd and from sysctl,
93  *      so it needs to protect itself from softirqs
94  */
95 static void update_defense_level(struct netns_ipvs *ipvs)
96 {
97         struct sysinfo i;
98         static int old_secure_tcp = 0;
99         int availmem;
100         int nomem;
101         int to_change = -1;
102
103         /* we only count free and buffered memory (in pages) */
104         si_meminfo(&i);
105         availmem = i.freeram + i.bufferram;
106         /* however in linux 2.5 the i.bufferram is total page cache size,
107            we need adjust it */
108         /* si_swapinfo(&i); */
109         /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111         nomem = (availmem < ipvs->sysctl_amemthresh);
112
113         local_bh_disable();
114
115         /* drop_entry */
116         spin_lock(&ipvs->dropentry_lock);
117         switch (ipvs->sysctl_drop_entry) {
118         case 0:
119                 atomic_set(&ipvs->dropentry, 0);
120                 break;
121         case 1:
122                 if (nomem) {
123                         atomic_set(&ipvs->dropentry, 1);
124                         ipvs->sysctl_drop_entry = 2;
125                 } else {
126                         atomic_set(&ipvs->dropentry, 0);
127                 }
128                 break;
129         case 2:
130                 if (nomem) {
131                         atomic_set(&ipvs->dropentry, 1);
132                 } else {
133                         atomic_set(&ipvs->dropentry, 0);
134                         ipvs->sysctl_drop_entry = 1;
135                 };
136                 break;
137         case 3:
138                 atomic_set(&ipvs->dropentry, 1);
139                 break;
140         }
141         spin_unlock(&ipvs->dropentry_lock);
142
143         /* drop_packet */
144         spin_lock(&ipvs->droppacket_lock);
145         switch (ipvs->sysctl_drop_packet) {
146         case 0:
147                 ipvs->drop_rate = 0;
148                 break;
149         case 1:
150                 if (nomem) {
151                         ipvs->drop_rate = ipvs->drop_counter
152                                 = ipvs->sysctl_amemthresh /
153                                 (ipvs->sysctl_amemthresh-availmem);
154                         ipvs->sysctl_drop_packet = 2;
155                 } else {
156                         ipvs->drop_rate = 0;
157                 }
158                 break;
159         case 2:
160                 if (nomem) {
161                         ipvs->drop_rate = ipvs->drop_counter
162                                 = ipvs->sysctl_amemthresh /
163                                 (ipvs->sysctl_amemthresh-availmem);
164                 } else {
165                         ipvs->drop_rate = 0;
166                         ipvs->sysctl_drop_packet = 1;
167                 }
168                 break;
169         case 3:
170                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
171                 break;
172         }
173         spin_unlock(&ipvs->droppacket_lock);
174
175         /* secure_tcp */
176         spin_lock(&ipvs->securetcp_lock);
177         switch (ipvs->sysctl_secure_tcp) {
178         case 0:
179                 if (old_secure_tcp >= 2)
180                         to_change = 0;
181                 break;
182         case 1:
183                 if (nomem) {
184                         if (old_secure_tcp < 2)
185                                 to_change = 1;
186                         ipvs->sysctl_secure_tcp = 2;
187                 } else {
188                         if (old_secure_tcp >= 2)
189                                 to_change = 0;
190                 }
191                 break;
192         case 2:
193                 if (nomem) {
194                         if (old_secure_tcp < 2)
195                                 to_change = 1;
196                 } else {
197                         if (old_secure_tcp >= 2)
198                                 to_change = 0;
199                         ipvs->sysctl_secure_tcp = 1;
200                 }
201                 break;
202         case 3:
203                 if (old_secure_tcp < 2)
204                         to_change = 1;
205                 break;
206         }
207         old_secure_tcp = ipvs->sysctl_secure_tcp;
208         if (to_change >= 0)
209                 ip_vs_protocol_timeout_change(ipvs,
210                                               ipvs->sysctl_secure_tcp > 1);
211         spin_unlock(&ipvs->securetcp_lock);
212
213         local_bh_enable();
214 }
215
216
217 /*
218  *      Timer for checking the defense
219  */
220 #define DEFENSE_TIMER_PERIOD    1*HZ
221
222 static void defense_work_handler(struct work_struct *work)
223 {
224         struct netns_ipvs *ipvs =
225                 container_of(work, struct netns_ipvs, defense_work.work);
226
227         update_defense_level(ipvs);
228         if (atomic_read(&ipvs->dropentry))
229                 ip_vs_random_dropentry(ipvs->net);
230         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
231 }
232 #endif
233
234 int
235 ip_vs_use_count_inc(void)
236 {
237         return try_module_get(THIS_MODULE);
238 }
239
240 void
241 ip_vs_use_count_dec(void)
242 {
243         module_put(THIS_MODULE);
244 }
245
246
247 /*
248  *      Hash tables for virtual service lookups (1 << 8 = 256 buckets each)
249  */
250 #define IP_VS_SVC_TAB_BITS 8
251 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)    /* number of buckets */
252 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)     /* reduces a hash to a bucket index */
253
254 /* the service table hashed by <netns, protocol, addr, port>, see ip_vs_svc_hashkey() */
255 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256 /* the service table hashed by <netns, fwmark>, see ip_vs_svc_fwm_hashkey() */
257 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
258
259
260 /*
261  *      Returns hash value for virtual service
262  */
263 static inline unsigned
264 ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
265                   const union nf_inet_addr *addr, __be16 port)
266 {
267         register unsigned porth = ntohs(port);
268         __be32 addr_fold = addr->ip;
269
270 #ifdef CONFIG_IP_VS_IPV6
271         if (af == AF_INET6)
272                 addr_fold = addr->ip6[0]^addr->ip6[1]^
273                             addr->ip6[2]^addr->ip6[3];
274 #endif
275         addr_fold ^= ((size_t)net>>8);
276
277         return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
278                 & IP_VS_SVC_TAB_MASK;
279 }
280
281 /*
282  *      Returns hash value of fwmark for virtual service lookup
283  */
284 static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
285 {
286         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
287 }
288
289 /*
290  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
291  *      or in the ip_vs_svc_fwm_table by fwmark.
292  *      Should be called with locked tables.
293  */
294 static int ip_vs_svc_hash(struct ip_vs_service *svc)
295 {
296         unsigned hash;
297
298         if (svc->flags & IP_VS_SVC_F_HASHED) {
299                 pr_err("%s(): request for already hashed, called from %pF\n",
300                        __func__, __builtin_return_address(0));
301                 return 0;
302         }
303
304         if (svc->fwmark == 0) {
305                 /*
306                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
307                  */
308                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
309                                          &svc->addr, svc->port);
310                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
311         } else {
312                 /*
313                  *  Hash it by fwmark in svc_fwm_table
314                  */
315                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
316                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
317         }
318
319         svc->flags |= IP_VS_SVC_F_HASHED;
320         /* increase its refcnt because it is referenced by the svc table */
321         atomic_inc(&svc->refcnt);
322         return 1;
323 }
324
325
326 /*
327  *      Unhashes a service from svc_table / svc_fwm_table.
328  *      Should be called with locked tables.
329  */
330 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
331 {
332         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
333                 pr_err("%s(): request for unhash flagged, called from %pF\n",
334                        __func__, __builtin_return_address(0));
335                 return 0;
336         }
337
338         if (svc->fwmark == 0) {
339                 /* Remove it from the svc_table table */
340                 list_del(&svc->s_list);
341         } else {
342                 /* Remove it from the svc_fwm_table table */
343                 list_del(&svc->f_list);
344         }
345
346         svc->flags &= ~IP_VS_SVC_F_HASHED;
347         atomic_dec(&svc->refcnt);
348         return 1;
349 }
350
351
352 /*
353  *      Get service by {netns, proto,addr,port} in the service table.
354  */
355 static inline struct ip_vs_service *
356 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
357                      const union nf_inet_addr *vaddr, __be16 vport)
358 {
359         unsigned hash;
360         struct ip_vs_service *svc;
361
362         /* Check for "full" addressed entries */
363         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
364
365         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
366                 if ((svc->af == af)
367                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
368                     && (svc->port == vport)
369                     && (svc->protocol == protocol)
370                     && net_eq(svc->net, net)) {
371                         /* HIT */
372                         return svc;
373                 }
374         }
375
376         return NULL;
377 }
378
379
380 /*
381  *      Get service by {fwmark} in the service table.
382  */
383 static inline struct ip_vs_service *
384 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
385 {
386         unsigned hash;
387         struct ip_vs_service *svc;
388
389         /* Check for fwmark addressed entries */
390         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
391
392         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
393                 if (svc->fwmark == fwmark && svc->af == af
394                     && net_eq(svc->net, net)) {
395                         /* HIT */
396                         return svc;
397                 }
398         }
399
400         return NULL;
401 }
402
403 struct ip_vs_service *
404 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
405                   const union nf_inet_addr *vaddr, __be16 vport)
406 {
407         struct ip_vs_service *svc;
408         struct netns_ipvs *ipvs = net_ipvs(net);
409
410         read_lock(&__ip_vs_svc_lock);
411
412         /*
413          *      Check the table hashed by fwmark first
414          */
415         if (fwmark) {
416                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
417                 if (svc)
418                         goto out;
419         }
420
421         /*
422          *      Check the table hashed by <protocol,addr,port>
423          *      for "full" addressed entries
424          */
425         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
426
427         if (svc == NULL
428             && protocol == IPPROTO_TCP
429             && atomic_read(&ipvs->ftpsvc_counter)
430             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
431                 /*
432                  * Check if ftp service entry exists, the packet
433                  * might belong to FTP data connections.
434                  */
435                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
436         }
437
438         if (svc == NULL
439             && atomic_read(&ipvs->nullsvc_counter)) {
440                 /*
441                  * Check if the catch-all port (port zero) exists
442                  */
443                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
444         }
445
446   out:
447         if (svc)
448                 atomic_inc(&svc->usecnt);
449         read_unlock(&__ip_vs_svc_lock);
450
451         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
452                       fwmark, ip_vs_proto_name(protocol),
453                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
454                       svc ? "hit" : "not hit");
455
456         return svc;
457 }
458
459
460 static inline void
461 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
462 {
463         atomic_inc(&svc->refcnt);
464         dest->svc = svc;
465 }
466
467 static void
468 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
469 {
470         struct ip_vs_service *svc = dest->svc;
471
472         dest->svc = NULL;
473         if (atomic_dec_and_test(&svc->refcnt)) {
474                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
475                               svc->fwmark,
476                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
477                               ntohs(svc->port), atomic_read(&svc->usecnt));
478                 free_percpu(svc->stats.cpustats);
479                 kfree(svc);
480         }
481 }
482
483
484 /*
485  *      Returns hash value for real service
486  */
487 static inline unsigned ip_vs_rs_hashkey(int af,
488                                             const union nf_inet_addr *addr,
489                                             __be16 port)
490 {
491         register unsigned porth = ntohs(port);
492         __be32 addr_fold = addr->ip;
493
494 #ifdef CONFIG_IP_VS_IPV6
495         if (af == AF_INET6)
496                 addr_fold = addr->ip6[0]^addr->ip6[1]^
497                             addr->ip6[2]^addr->ip6[3];
498 #endif
499
500         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
501                 & IP_VS_RTAB_MASK;
502 }
503
504 /*
505  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
506  *      should be called with locked tables.
507  */
508 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
509 {
510         unsigned hash;
511
512         if (!list_empty(&dest->d_list)) {
513                 return 0;
514         }
515
516         /*
517          *      Hash by proto,addr,port,
518          *      which are the parameters of the real service.
519          */
520         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
521
522         list_add(&dest->d_list, &ipvs->rs_table[hash]);
523
524         return 1;
525 }
526
527 /*
528  *      UNhashes ip_vs_dest from rs_table.
529  *      should be called with locked tables.
530  */
531 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
532 {
533         /*
534          * Remove it from the rs_table table.
535          */
536         if (!list_empty(&dest->d_list)) {
537                 list_del(&dest->d_list);
538                 INIT_LIST_HEAD(&dest->d_list);
539         }
540
541         return 1;
542 }
543
544 /*
545  *      Lookup real service by <proto,addr,port> in the real service table.
546  */
547 struct ip_vs_dest *
548 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
549                           const union nf_inet_addr *daddr,
550                           __be16 dport)
551 {
552         struct netns_ipvs *ipvs = net_ipvs(net);
553         unsigned hash;
554         struct ip_vs_dest *dest;
555
556         /*
557          *      Check for "full" addressed entries
558          *      Return the first found entry
559          */
560         hash = ip_vs_rs_hashkey(af, daddr, dport);
561
562         read_lock(&ipvs->rs_lock);
563         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
564                 if ((dest->af == af)
565                     && ip_vs_addr_equal(af, &dest->addr, daddr)
566                     && (dest->port == dport)
567                     && ((dest->protocol == protocol) ||
568                         dest->vfwmark)) {
569                         /* HIT */
570                         read_unlock(&ipvs->rs_lock);
571                         return dest;
572                 }
573         }
574         read_unlock(&ipvs->rs_lock);
575
576         return NULL;
577 }
578
579 /*
580  *      Lookup destination by {addr,port} in the given service
581  */
582 static struct ip_vs_dest *
583 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
584                   __be16 dport)
585 {
586         struct ip_vs_dest *dest;
587
588         /*
589          * Find the destination for the given service
590          */
591         list_for_each_entry(dest, &svc->destinations, n_list) {
592                 if ((dest->af == svc->af)
593                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
594                     && (dest->port == dport)) {
595                         /* HIT */
596                         return dest;
597                 }
598         }
599
600         return NULL;
601 }
602
603 /*
604  * Find destination by {daddr,dport,vaddr,protocol}
605  * Cretaed to be used in ip_vs_process_message() in
606  * the backup synchronization daemon. It finds the
607  * destination to be bound to the received connection
608  * on the backup.
609  *
610  * ip_vs_lookup_real_service() looked promissing, but
611  * seems not working as expected.
612  */
613 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
614                                    const union nf_inet_addr *daddr,
615                                    __be16 dport,
616                                    const union nf_inet_addr *vaddr,
617                                    __be16 vport, __u16 protocol, __u32 fwmark)
618 {
619         struct ip_vs_dest *dest;
620         struct ip_vs_service *svc;
621
622         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
623         if (!svc)
624                 return NULL;
625         dest = ip_vs_lookup_dest(svc, daddr, dport);
626         if (dest)
627                 atomic_inc(&dest->refcnt);
628         ip_vs_service_put(svc);
629         return dest;
630 }
631
632 /*
633  *  Lookup dest by {svc,addr,port} in the destination trash.
634  *  The destination trash is used to hold the destinations that are removed
635  *  from the service table but are still referenced by some conn entries.
636  *  The reason to add the destination trash is when the dest is temporary
637  *  down (either by administrator or by monitor program), the dest can be
638  *  picked back from the trash, the remaining connections to the dest can
639  *  continue, and the counting information of the dest is also useful for
640  *  scheduling.
641  */
642 static struct ip_vs_dest *
643 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
644                      __be16 dport)
645 {
646         struct ip_vs_dest *dest, *nxt;
647         struct netns_ipvs *ipvs = net_ipvs(svc->net);
648
649         /*
650          * Find the destination in trash
651          */
652         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
653                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
654                               "dest->refcnt=%d\n",
655                               dest->vfwmark,
656                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
657                               ntohs(dest->port),
658                               atomic_read(&dest->refcnt));
659                 if (dest->af == svc->af &&
660                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
661                     dest->port == dport &&
662                     dest->vfwmark == svc->fwmark &&
663                     dest->protocol == svc->protocol &&
664                     (svc->fwmark ||
665                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
666                       dest->vport == svc->port))) {
667                         /* HIT */
668                         return dest;
669                 }
670
671                 /*
672                  * Try to purge the destination from trash if not referenced
673                  */
674                 if (atomic_read(&dest->refcnt) == 1) {
675                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
676                                       "from trash\n",
677                                       dest->vfwmark,
678                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
679                                       ntohs(dest->port));
680                         list_del(&dest->n_list);
681                         ip_vs_dst_reset(dest);
682                         __ip_vs_unbind_svc(dest);
683                         free_percpu(dest->stats.cpustats);
684                         kfree(dest);
685                 }
686         }
687
688         return NULL;
689 }
690
691
692 /*
693  *  Clean up all the destinations in the trash
694  *  Called by the ip_vs_control_cleanup()
695  *
696  *  When the ip_vs_control_clearup is activated by ipvs module exit,
697  *  the service tables must have been flushed and all the connections
698  *  are expired, and the refcnt of each destination in the trash must
699  *  be 1, so we simply release them here.
700  */
701 static void ip_vs_trash_cleanup(struct net *net)
702 {
703         struct ip_vs_dest *dest, *nxt;
704         struct netns_ipvs *ipvs = net_ipvs(net);
705
706         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
707                 list_del(&dest->n_list);
708                 ip_vs_dst_reset(dest);
709                 __ip_vs_unbind_svc(dest);
710                 free_percpu(dest->stats.cpustats);
711                 kfree(dest);
712         }
713 }
714
715 static void
716 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
717 {
718 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
719
720         spin_lock_bh(&src->lock);
721
722         IP_VS_SHOW_STATS_COUNTER(conns);
723         IP_VS_SHOW_STATS_COUNTER(inpkts);
724         IP_VS_SHOW_STATS_COUNTER(outpkts);
725         IP_VS_SHOW_STATS_COUNTER(inbytes);
726         IP_VS_SHOW_STATS_COUNTER(outbytes);
727
728         ip_vs_read_estimator(dst, src);
729
730         spin_unlock_bh(&src->lock);
731 }
732
733 static void
734 ip_vs_zero_stats(struct ip_vs_stats *stats)
735 {
736         spin_lock_bh(&stats->lock);
737
738         /* get current counters as zero point, rates are zeroed */
739
740 #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
741
742         IP_VS_ZERO_STATS_COUNTER(conns);
743         IP_VS_ZERO_STATS_COUNTER(inpkts);
744         IP_VS_ZERO_STATS_COUNTER(outpkts);
745         IP_VS_ZERO_STATS_COUNTER(inbytes);
746         IP_VS_ZERO_STATS_COUNTER(outbytes);
747
748         ip_vs_zero_estimator(stats);
749
750         spin_unlock_bh(&stats->lock);
751 }
752
753 /*
754  *      Update a destination in the given service
755  */
756 static void
757 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
758                     struct ip_vs_dest_user_kern *udest, int add)
759 {
760         struct netns_ipvs *ipvs = net_ipvs(svc->net);
761         int conn_flags;
762
763         /* set the weight and the flags */
764         atomic_set(&dest->weight, udest->weight);
765         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
766         conn_flags |= IP_VS_CONN_F_INACTIVE;
767
768         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
769         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
770                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
771         } else {
772                 /*
773                  *    Put the real service in rs_table if not present.
774                  *    For now only for NAT!
775                  */
776                 write_lock_bh(&ipvs->rs_lock);
777                 ip_vs_rs_hash(ipvs, dest);
778                 write_unlock_bh(&ipvs->rs_lock);
779         }
780         atomic_set(&dest->conn_flags, conn_flags);
781
782         /* bind the service */
783         if (!dest->svc) {
784                 __ip_vs_bind_svc(dest, svc);
785         } else {
786                 if (dest->svc != svc) {
787                         __ip_vs_unbind_svc(dest);
788                         ip_vs_zero_stats(&dest->stats);
789                         __ip_vs_bind_svc(dest, svc);
790                 }
791         }
792
793         /* set the dest status flags */
794         dest->flags |= IP_VS_DEST_F_AVAILABLE;
795
796         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
797                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
798         dest->u_threshold = udest->u_threshold;
799         dest->l_threshold = udest->l_threshold;
800
801         spin_lock_bh(&dest->dst_lock);
802         ip_vs_dst_reset(dest);
803         spin_unlock_bh(&dest->dst_lock);
804
805         if (add)
806                 ip_vs_start_estimator(svc->net, &dest->stats);
807
808         write_lock_bh(&__ip_vs_svc_lock);
809
810         /* Wait until all other svc users go away */
811         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
812
813         if (add) {
814                 list_add(&dest->n_list, &svc->destinations);
815                 svc->num_dests++;
816         }
817
818         /* call the update_service, because server weight may be changed */
819         if (svc->scheduler->update_service)
820                 svc->scheduler->update_service(svc);
821
822         write_unlock_bh(&__ip_vs_svc_lock);
823 }
824
825
826 /*
827  *      Create a destination for the given service
828  */
829 static int
830 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
831                struct ip_vs_dest **dest_p)
832 {
833         struct ip_vs_dest *dest;
834         unsigned atype;
835
836         EnterFunction(2);
837
838 #ifdef CONFIG_IP_VS_IPV6
839         if (svc->af == AF_INET6) {
840                 atype = ipv6_addr_type(&udest->addr.in6);
841                 if ((!(atype & IPV6_ADDR_UNICAST) ||
842                         atype & IPV6_ADDR_LINKLOCAL) &&
843                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
844                         return -EINVAL;
845         } else
846 #endif
847         {
848                 atype = inet_addr_type(svc->net, udest->addr.ip);
849                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
850                         return -EINVAL;
851         }
852
853         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
854         if (dest == NULL) {
855                 pr_err("%s(): no memory.\n", __func__);
856                 return -ENOMEM;
857         }
858         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
859         if (!dest->stats.cpustats) {
860                 pr_err("%s() alloc_percpu failed\n", __func__);
861                 goto err_alloc;
862         }
863
864         dest->af = svc->af;
865         dest->protocol = svc->protocol;
866         dest->vaddr = svc->addr;
867         dest->vport = svc->port;
868         dest->vfwmark = svc->fwmark;
869         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
870         dest->port = udest->port;
871
872         atomic_set(&dest->activeconns, 0);
873         atomic_set(&dest->inactconns, 0);
874         atomic_set(&dest->persistconns, 0);
875         atomic_set(&dest->refcnt, 1);
876
877         INIT_LIST_HEAD(&dest->d_list);
878         spin_lock_init(&dest->dst_lock);
879         spin_lock_init(&dest->stats.lock);
880         __ip_vs_update_dest(svc, dest, udest, 1);
881
882         *dest_p = dest;
883
884         LeaveFunction(2);
885         return 0;
886
887 err_alloc:
888         kfree(dest);
889         return -ENOMEM;
890 }
891
892
893 /*
894  *      Add a destination into an existing service
895  */
896 static int
897 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
898 {
899         struct ip_vs_dest *dest;
900         union nf_inet_addr daddr;
901         __be16 dport = udest->port;
902         int ret;
903
904         EnterFunction(2);
905
906         if (udest->weight < 0) {
907                 pr_err("%s(): server weight less than zero\n", __func__);
908                 return -ERANGE;
909         }
910
911         if (udest->l_threshold > udest->u_threshold) {
912                 pr_err("%s(): lower threshold is higher than upper threshold\n",
913                         __func__);
914                 return -ERANGE;
915         }
916
917         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
918
919         /*
920          * Check if the dest already exists in the list
921          */
922         dest = ip_vs_lookup_dest(svc, &daddr, dport);
923
924         if (dest != NULL) {
925                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
926                 return -EEXIST;
927         }
928
929         /*
930          * Check if the dest already exists in the trash and
931          * is from the same service
932          */
933         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
934
935         if (dest != NULL) {
936                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
937                               "dest->refcnt=%d, service %u/%s:%u\n",
938                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
939                               atomic_read(&dest->refcnt),
940                               dest->vfwmark,
941                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
942                               ntohs(dest->vport));
943
944                 /*
945                  * Get the destination from the trash
946                  */
947                 list_del(&dest->n_list);
948
949                 __ip_vs_update_dest(svc, dest, udest, 1);
950                 ret = 0;
951         } else {
952                 /*
953                  * Allocate and initialize the dest structure
954                  */
955                 ret = ip_vs_new_dest(svc, udest, &dest);
956         }
957         LeaveFunction(2);
958
959         return ret;
960 }
961
962
963 /*
964  *      Edit a destination in the given service
965  */
966 static int
967 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
968 {
969         struct ip_vs_dest *dest;
970         union nf_inet_addr daddr;
971         __be16 dport = udest->port;
972
973         EnterFunction(2);
974
975         if (udest->weight < 0) {
976                 pr_err("%s(): server weight less than zero\n", __func__);
977                 return -ERANGE;
978         }
979
980         if (udest->l_threshold > udest->u_threshold) {
981                 pr_err("%s(): lower threshold is higher than upper threshold\n",
982                         __func__);
983                 return -ERANGE;
984         }
985
986         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
987
988         /*
989          *  Lookup the destination list
990          */
991         dest = ip_vs_lookup_dest(svc, &daddr, dport);
992
993         if (dest == NULL) {
994                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
995                 return -ENOENT;
996         }
997
998         __ip_vs_update_dest(svc, dest, udest, 0);
999         LeaveFunction(2);
1000
1001         return 0;
1002 }
1003
1004
1005 /*
1006  *      Delete a destination (must be already unlinked from the service)
1007  */
1008 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1009 {
1010         struct netns_ipvs *ipvs = net_ipvs(net);
1011
1012         ip_vs_stop_estimator(net, &dest->stats);
1013
1014         /*
1015          *  Remove it from the d-linked list with the real services.
1016          */
1017         write_lock_bh(&ipvs->rs_lock);
1018         ip_vs_rs_unhash(dest);
1019         write_unlock_bh(&ipvs->rs_lock);
1020
1021         /*
1022          *  Decrease the refcnt of the dest, and free the dest
1023          *  if nobody refers to it (refcnt=0). Otherwise, throw
1024          *  the destination into the trash.
1025          */
1026         if (atomic_dec_and_test(&dest->refcnt)) {
1027                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1028                               dest->vfwmark,
1029                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1030                               ntohs(dest->port));
1031                 ip_vs_dst_reset(dest);
1032                 /* simply decrease svc->refcnt here, let the caller check
1033                    and release the service if nobody refers to it.
1034                    Only user context can release destination and service,
1035                    and only one user context can update virtual service at a
1036                    time, so the operation here is OK */
1037                 atomic_dec(&dest->svc->refcnt);
1038                 free_percpu(dest->stats.cpustats);
1039                 kfree(dest);
1040         } else {
1041                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1042                               "dest->refcnt=%d\n",
1043                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1044                               ntohs(dest->port),
1045                               atomic_read(&dest->refcnt));
1046                 list_add(&dest->n_list, &ipvs->dest_trash);
1047                 atomic_inc(&dest->refcnt);
1048         }
1049 }
1050
1051
1052 /*
1053  *      Unlink a destination from the given service
1054  */
1055 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1056                                 struct ip_vs_dest *dest,
1057                                 int svcupd)
1058 {
1059         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1060
1061         /*
1062          *  Remove it from the d-linked destination list.
1063          */
1064         list_del(&dest->n_list);
1065         svc->num_dests--;
1066
1067         /*
1068          *  Call the update_service function of its scheduler
1069          */
1070         if (svcupd && svc->scheduler->update_service)
1071                         svc->scheduler->update_service(svc);
1072 }
1073
1074
1075 /*
1076  *      Delete a destination server in the given service
1077  */
1078 static int
1079 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1080 {
1081         struct ip_vs_dest *dest;
1082         __be16 dport = udest->port;
1083
1084         EnterFunction(2);
1085
1086         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1087
1088         if (dest == NULL) {
1089                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1090                 return -ENOENT;
1091         }
1092
1093         write_lock_bh(&__ip_vs_svc_lock);
1094
1095         /*
1096          *      Wait until all other svc users go away.
1097          */
1098         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1099
1100         /*
1101          *      Unlink dest from the service
1102          */
1103         __ip_vs_unlink_dest(svc, dest, 1);
1104
1105         write_unlock_bh(&__ip_vs_svc_lock);
1106
1107         /*
1108          *      Delete the destination
1109          */
1110         __ip_vs_del_dest(svc->net, dest);
1111
1112         LeaveFunction(2);
1113
1114         return 0;
1115 }
1116
1117
1118 /*
1119  *      Add a service into the service hash table
1120  */
1121 static int
1122 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1123                   struct ip_vs_service **svc_p)
1124 {
1125         int ret = 0;
1126         struct ip_vs_scheduler *sched = NULL;
1127         struct ip_vs_pe *pe = NULL;
1128         struct ip_vs_service *svc = NULL;
1129         struct netns_ipvs *ipvs = net_ipvs(net);
1130
1131         /* increase the module use count */
1132         ip_vs_use_count_inc();
1133
1134         /* Lookup the scheduler by 'u->sched_name' */
1135         sched = ip_vs_scheduler_get(u->sched_name);
1136         if (sched == NULL) {
1137                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1138                 ret = -ENOENT;
1139                 goto out_err;
1140         }
1141
1142         if (u->pe_name && *u->pe_name) {
1143                 pe = ip_vs_pe_getbyname(u->pe_name);
1144                 if (pe == NULL) {
1145                         pr_info("persistence engine module ip_vs_pe_%s "
1146                                 "not found\n", u->pe_name);
1147                         ret = -ENOENT;
1148                         goto out_err;
1149                 }
1150         }
1151
1152 #ifdef CONFIG_IP_VS_IPV6
1153         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1154                 ret = -EINVAL;
1155                 goto out_err;
1156         }
1157 #endif
1158
1159         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1160         if (svc == NULL) {
1161                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1162                 ret = -ENOMEM;
1163                 goto out_err;
1164         }
1165         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1166         if (!svc->stats.cpustats) {
1167                 pr_err("%s() alloc_percpu failed\n", __func__);
1168                 goto out_err;
1169         }
1170
1171         /* I'm the first user of the service */
1172         atomic_set(&svc->usecnt, 0);
1173         atomic_set(&svc->refcnt, 0);
1174
1175         svc->af = u->af;
1176         svc->protocol = u->protocol;
1177         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1178         svc->port = u->port;
1179         svc->fwmark = u->fwmark;
1180         svc->flags = u->flags;
1181         svc->timeout = u->timeout * HZ;
1182         svc->netmask = u->netmask;
1183         svc->net = net;
1184
1185         INIT_LIST_HEAD(&svc->destinations);
1186         rwlock_init(&svc->sched_lock);
1187         spin_lock_init(&svc->stats.lock);
1188
1189         /* Bind the scheduler */
1190         ret = ip_vs_bind_scheduler(svc, sched);
1191         if (ret)
1192                 goto out_err;
1193         sched = NULL;
1194
1195         /* Bind the ct retriever */
1196         ip_vs_bind_pe(svc, pe);
1197         pe = NULL;
1198
1199         /* Update the virtual service counters */
1200         if (svc->port == FTPPORT)
1201                 atomic_inc(&ipvs->ftpsvc_counter);
1202         else if (svc->port == 0)
1203                 atomic_inc(&ipvs->nullsvc_counter);
1204
1205         ip_vs_start_estimator(net, &svc->stats);
1206
1207         /* Count only IPv4 services for old get/setsockopt interface */
1208         if (svc->af == AF_INET)
1209                 ipvs->num_services++;
1210
1211         /* Hash the service into the service table */
1212         write_lock_bh(&__ip_vs_svc_lock);
1213         ip_vs_svc_hash(svc);
1214         write_unlock_bh(&__ip_vs_svc_lock);
1215
1216         *svc_p = svc;
1217         return 0;
1218
1219
1220  out_err:
1221         if (svc != NULL) {
1222                 ip_vs_unbind_scheduler(svc);
1223                 if (svc->inc) {
1224                         local_bh_disable();
1225                         ip_vs_app_inc_put(svc->inc);
1226                         local_bh_enable();
1227                 }
1228                 if (svc->stats.cpustats)
1229                         free_percpu(svc->stats.cpustats);
1230                 kfree(svc);
1231         }
1232         ip_vs_scheduler_put(sched);
1233         ip_vs_pe_put(pe);
1234
1235         /* decrease the module use count */
1236         ip_vs_use_count_dec();
1237
1238         return ret;
1239 }
1240
1241
1242 /*
1243  *      Edit a service and bind it with a new scheduler
1244  */
1245 static int
1246 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1247 {
1248         struct ip_vs_scheduler *sched, *old_sched;
1249         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1250         int ret = 0;
1251
1252         /*
1253          * Lookup the scheduler, by 'u->sched_name'
1254          */
1255         sched = ip_vs_scheduler_get(u->sched_name);
1256         if (sched == NULL) {
1257                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1258                 return -ENOENT;
1259         }
1260         old_sched = sched;
1261
1262         if (u->pe_name && *u->pe_name) {
1263                 pe = ip_vs_pe_getbyname(u->pe_name);
1264                 if (pe == NULL) {
1265                         pr_info("persistence engine module ip_vs_pe_%s "
1266                                 "not found\n", u->pe_name);
1267                         ret = -ENOENT;
1268                         goto out;
1269                 }
1270                 old_pe = pe;
1271         }
1272
1273 #ifdef CONFIG_IP_VS_IPV6
1274         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1275                 ret = -EINVAL;
1276                 goto out;
1277         }
1278 #endif
1279
1280         write_lock_bh(&__ip_vs_svc_lock);
1281
1282         /*
1283          * Wait until all other svc users go away.
1284          */
1285         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1286
1287         /*
1288          * Set the flags and timeout value
1289          */
1290         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1291         svc->timeout = u->timeout * HZ;
1292         svc->netmask = u->netmask;
1293
1294         old_sched = svc->scheduler;
1295         if (sched != old_sched) {
1296                 /*
1297                  * Unbind the old scheduler
1298                  */
1299                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1300                         old_sched = sched;
1301                         goto out_unlock;
1302                 }
1303
1304                 /*
1305                  * Bind the new scheduler
1306                  */
1307                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1308                         /*
1309                          * If ip_vs_bind_scheduler fails, restore the old
1310                          * scheduler.
1311                          * The main reason of failure is out of memory.
1312                          *
1313                          * The question is if the old scheduler can be
1314                          * restored all the time. TODO: if it cannot be
1315                          * restored some time, we must delete the service,
1316                          * otherwise the system may crash.
1317                          */
1318                         ip_vs_bind_scheduler(svc, old_sched);
1319                         old_sched = sched;
1320                         goto out_unlock;
1321                 }
1322         }
1323
1324         old_pe = svc->pe;
1325         if (pe != old_pe) {
1326                 ip_vs_unbind_pe(svc);
1327                 ip_vs_bind_pe(svc, pe);
1328         }
1329
1330   out_unlock:
1331         write_unlock_bh(&__ip_vs_svc_lock);
1332   out:
1333         ip_vs_scheduler_put(old_sched);
1334         ip_vs_pe_put(old_pe);
1335         return ret;
1336 }
1337
1338
1339 /*
1340  *      Delete a service from the service list
1341  *      - The service must be unlinked, unlocked and not referenced!
1342  *      - We are called under _bh lock
1343  */
1344 static void __ip_vs_del_service(struct ip_vs_service *svc)
1345 {
1346         struct ip_vs_dest *dest, *nxt;
1347         struct ip_vs_scheduler *old_sched;
1348         struct ip_vs_pe *old_pe;
1349         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1350
1351         pr_info("%s: enter\n", __func__);
1352
1353         /* Count only IPv4 services for old get/setsockopt interface */
1354         if (svc->af == AF_INET)
1355                 ipvs->num_services--;
1356
1357         ip_vs_stop_estimator(svc->net, &svc->stats);
1358
1359         /* Unbind scheduler */
1360         old_sched = svc->scheduler;
1361         ip_vs_unbind_scheduler(svc);
1362         ip_vs_scheduler_put(old_sched);
1363
1364         /* Unbind persistence engine */
1365         old_pe = svc->pe;
1366         ip_vs_unbind_pe(svc);
1367         ip_vs_pe_put(old_pe);
1368
1369         /* Unbind app inc */
1370         if (svc->inc) {
1371                 ip_vs_app_inc_put(svc->inc);
1372                 svc->inc = NULL;
1373         }
1374
1375         /*
1376          *    Unlink the whole destination list
1377          */
1378         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1379                 __ip_vs_unlink_dest(svc, dest, 0);
1380                 __ip_vs_del_dest(svc->net, dest);
1381         }
1382
1383         /*
1384          *    Update the virtual service counters
1385          */
1386         if (svc->port == FTPPORT)
1387                 atomic_dec(&ipvs->ftpsvc_counter);
1388         else if (svc->port == 0)
1389                 atomic_dec(&ipvs->nullsvc_counter);
1390
1391         /*
1392          *    Free the service if nobody refers to it
1393          */
1394         if (atomic_read(&svc->refcnt) == 0) {
1395                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1396                               svc->fwmark,
1397                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1398                               ntohs(svc->port), atomic_read(&svc->usecnt));
1399                 free_percpu(svc->stats.cpustats);
1400                 kfree(svc);
1401         }
1402
1403         /* decrease the module use count */
1404         ip_vs_use_count_dec();
1405 }
1406
1407 /*
1408  * Unlink a service from list and try to delete it if its refcnt reached 0
1409  */
1410 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1411 {
1412         /*
1413          * Unhash it from the service table
1414          */
1415         write_lock_bh(&__ip_vs_svc_lock);
1416
1417         ip_vs_svc_unhash(svc);
1418
1419         /*
1420          * Wait until all the svc users go away.
1421          */
1422         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1423
1424         __ip_vs_del_service(svc);
1425
1426         write_unlock_bh(&__ip_vs_svc_lock);
1427 }
1428
1429 /*
1430  *      Delete a service from the service list
1431  */
1432 static int ip_vs_del_service(struct ip_vs_service *svc)
1433 {
1434         if (svc == NULL)
1435                 return -EEXIST;
1436         ip_vs_unlink_service(svc);
1437
1438         return 0;
1439 }
1440
1441
1442 /*
1443  *      Flush all the virtual services
1444  */
1445 static int ip_vs_flush(struct net *net)
1446 {
1447         int idx;
1448         struct ip_vs_service *svc, *nxt;
1449
1450         /*
1451          * Flush the service table hashed by <netns,protocol,addr,port>
1452          */
1453         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1454                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1455                                          s_list) {
1456                         if (net_eq(svc->net, net))
1457                                 ip_vs_unlink_service(svc);
1458                 }
1459         }
1460
1461         /*
1462          * Flush the service table hashed by fwmark
1463          */
1464         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1465                 list_for_each_entry_safe(svc, nxt,
1466                                          &ip_vs_svc_fwm_table[idx], f_list) {
1467                         if (net_eq(svc->net, net))
1468                                 ip_vs_unlink_service(svc);
1469                 }
1470         }
1471
1472         return 0;
1473 }
1474
1475
1476 /*
1477  *      Zero counters in a service or all services
1478  */
1479 static int ip_vs_zero_service(struct ip_vs_service *svc)
1480 {
1481         struct ip_vs_dest *dest;
1482
1483         write_lock_bh(&__ip_vs_svc_lock);
1484         list_for_each_entry(dest, &svc->destinations, n_list) {
1485                 ip_vs_zero_stats(&dest->stats);
1486         }
1487         ip_vs_zero_stats(&svc->stats);
1488         write_unlock_bh(&__ip_vs_svc_lock);
1489         return 0;
1490 }
1491
1492 static int ip_vs_zero_all(struct net *net)
1493 {
1494         int idx;
1495         struct ip_vs_service *svc;
1496
1497         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1498                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1499                         if (net_eq(svc->net, net))
1500                                 ip_vs_zero_service(svc);
1501                 }
1502         }
1503
1504         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1505                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1506                         if (net_eq(svc->net, net))
1507                                 ip_vs_zero_service(svc);
1508                 }
1509         }
1510
1511         ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1512         return 0;
1513 }
1514
1515 #ifdef CONFIG_SYSCTL
1516 static int
1517 proc_do_defense_mode(ctl_table *table, int write,
1518                      void __user *buffer, size_t *lenp, loff_t *ppos)
1519 {
1520         struct net *net = current->nsproxy->net_ns;
1521         int *valp = table->data;
1522         int val = *valp;
1523         int rc;
1524
1525         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1526         if (write && (*valp != val)) {
1527                 if ((*valp < 0) || (*valp > 3)) {
1528                         /* Restore the correct value */
1529                         *valp = val;
1530                 } else {
1531                         update_defense_level(net_ipvs(net));
1532                 }
1533         }
1534         return rc;
1535 }
1536
1537 static int
1538 proc_do_sync_threshold(ctl_table *table, int write,
1539                        void __user *buffer, size_t *lenp, loff_t *ppos)
1540 {
1541         int *valp = table->data;
1542         int val[2];
1543         int rc;
1544
1545         /* backup the value first */
1546         memcpy(val, valp, sizeof(val));
1547
1548         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1549         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1550                 /* Restore the correct value */
1551                 memcpy(valp, val, sizeof(val));
1552         }
1553         return rc;
1554 }
1555
1556 static int
1557 proc_do_sync_mode(ctl_table *table, int write,
1558                      void __user *buffer, size_t *lenp, loff_t *ppos)
1559 {
1560         int *valp = table->data;
1561         int val = *valp;
1562         int rc;
1563
1564         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1565         if (write && (*valp != val)) {
1566                 if ((*valp < 0) || (*valp > 1)) {
1567                         /* Restore the correct value */
1568                         *valp = val;
1569                 } else {
1570                         struct net *net = current->nsproxy->net_ns;
1571                         ip_vs_sync_switch_mode(net, val);
1572                 }
1573         }
1574         return rc;
1575 }
1576
1577 /*
1578  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1579  *      Do not change order or insert new entries without
1580  *      align with netns init in __ip_vs_control_init()
1581  */
1582
1583 static struct ctl_table vs_vars[] = {
1584         {
1585                 .procname       = "amemthresh",
1586                 .maxlen         = sizeof(int),
1587                 .mode           = 0644,
1588                 .proc_handler   = proc_dointvec,
1589         },
1590         {
1591                 .procname       = "am_droprate",
1592                 .maxlen         = sizeof(int),
1593                 .mode           = 0644,
1594                 .proc_handler   = proc_dointvec,
1595         },
1596         {
1597                 .procname       = "drop_entry",
1598                 .maxlen         = sizeof(int),
1599                 .mode           = 0644,
1600                 .proc_handler   = proc_do_defense_mode,
1601         },
1602         {
1603                 .procname       = "drop_packet",
1604                 .maxlen         = sizeof(int),
1605                 .mode           = 0644,
1606                 .proc_handler   = proc_do_defense_mode,
1607         },
1608 #ifdef CONFIG_IP_VS_NFCT
1609         {
1610                 .procname       = "conntrack",
1611                 .maxlen         = sizeof(int),
1612                 .mode           = 0644,
1613                 .proc_handler   = &proc_dointvec,
1614         },
1615 #endif
1616         {
1617                 .procname       = "secure_tcp",
1618                 .maxlen         = sizeof(int),
1619                 .mode           = 0644,
1620                 .proc_handler   = proc_do_defense_mode,
1621         },
1622         {
1623                 .procname       = "snat_reroute",
1624                 .maxlen         = sizeof(int),
1625                 .mode           = 0644,
1626                 .proc_handler   = &proc_dointvec,
1627         },
1628         {
1629                 .procname       = "sync_version",
1630                 .maxlen         = sizeof(int),
1631                 .mode           = 0644,
1632                 .proc_handler   = &proc_do_sync_mode,
1633         },
1634         {
1635                 .procname       = "cache_bypass",
1636                 .maxlen         = sizeof(int),
1637                 .mode           = 0644,
1638                 .proc_handler   = proc_dointvec,
1639         },
1640         {
1641                 .procname       = "expire_nodest_conn",
1642                 .maxlen         = sizeof(int),
1643                 .mode           = 0644,
1644                 .proc_handler   = proc_dointvec,
1645         },
1646         {
1647                 .procname       = "expire_quiescent_template",
1648                 .maxlen         = sizeof(int),
1649                 .mode           = 0644,
1650                 .proc_handler   = proc_dointvec,
1651         },
1652         {
1653                 .procname       = "sync_threshold",
1654                 .maxlen         =
1655                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1656                 .mode           = 0644,
1657                 .proc_handler   = proc_do_sync_threshold,
1658         },
1659         {
1660                 .procname       = "nat_icmp_send",
1661                 .maxlen         = sizeof(int),
1662                 .mode           = 0644,
1663                 .proc_handler   = proc_dointvec,
1664         },
1665 #ifdef CONFIG_IP_VS_DEBUG
1666         {
1667                 .procname       = "debug_level",
1668                 .data           = &sysctl_ip_vs_debug_level,
1669                 .maxlen         = sizeof(int),
1670                 .mode           = 0644,
1671                 .proc_handler   = proc_dointvec,
1672         },
1673 #endif
1674 #if 0
1675         {
1676                 .procname       = "timeout_established",
1677                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1678                 .maxlen         = sizeof(int),
1679                 .mode           = 0644,
1680                 .proc_handler   = proc_dointvec_jiffies,
1681         },
1682         {
1683                 .procname       = "timeout_synsent",
1684                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1685                 .maxlen         = sizeof(int),
1686                 .mode           = 0644,
1687                 .proc_handler   = proc_dointvec_jiffies,
1688         },
1689         {
1690                 .procname       = "timeout_synrecv",
1691                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1692                 .maxlen         = sizeof(int),
1693                 .mode           = 0644,
1694                 .proc_handler   = proc_dointvec_jiffies,
1695         },
1696         {
1697                 .procname       = "timeout_finwait",
1698                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1699                 .maxlen         = sizeof(int),
1700                 .mode           = 0644,
1701                 .proc_handler   = proc_dointvec_jiffies,
1702         },
1703         {
1704                 .procname       = "timeout_timewait",
1705                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1706                 .maxlen         = sizeof(int),
1707                 .mode           = 0644,
1708                 .proc_handler   = proc_dointvec_jiffies,
1709         },
1710         {
1711                 .procname       = "timeout_close",
1712                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1713                 .maxlen         = sizeof(int),
1714                 .mode           = 0644,
1715                 .proc_handler   = proc_dointvec_jiffies,
1716         },
1717         {
1718                 .procname       = "timeout_closewait",
1719                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1720                 .maxlen         = sizeof(int),
1721                 .mode           = 0644,
1722                 .proc_handler   = proc_dointvec_jiffies,
1723         },
1724         {
1725                 .procname       = "timeout_lastack",
1726                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1727                 .maxlen         = sizeof(int),
1728                 .mode           = 0644,
1729                 .proc_handler   = proc_dointvec_jiffies,
1730         },
1731         {
1732                 .procname       = "timeout_listen",
1733                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1734                 .maxlen         = sizeof(int),
1735                 .mode           = 0644,
1736                 .proc_handler   = proc_dointvec_jiffies,
1737         },
1738         {
1739                 .procname       = "timeout_synack",
1740                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1741                 .maxlen         = sizeof(int),
1742                 .mode           = 0644,
1743                 .proc_handler   = proc_dointvec_jiffies,
1744         },
1745         {
1746                 .procname       = "timeout_udp",
1747                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1748                 .maxlen         = sizeof(int),
1749                 .mode           = 0644,
1750                 .proc_handler   = proc_dointvec_jiffies,
1751         },
1752         {
1753                 .procname       = "timeout_icmp",
1754                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1755                 .maxlen         = sizeof(int),
1756                 .mode           = 0644,
1757                 .proc_handler   = proc_dointvec_jiffies,
1758         },
1759 #endif
1760         { }
1761 };
1762
1763 const struct ctl_path net_vs_ctl_path[] = {
1764         { .procname = "net", },
1765         { .procname = "ipv4", },
1766         { .procname = "vs", },
1767         { }
1768 };
1769 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1770 #endif
1771
1772 #ifdef CONFIG_PROC_FS
1773
1774 struct ip_vs_iter {
1775         struct seq_net_private p;  /* Do not move this, netns depends upon it*/
1776         struct list_head *table;
1777         int bucket;
1778 };
1779
1780 /*
1781  *      Write the contents of the VS rule table to a PROCfs file.
1782  *      (It is kept just for backward compatibility)
1783  */
1784 static inline const char *ip_vs_fwd_name(unsigned flags)
1785 {
1786         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1787         case IP_VS_CONN_F_LOCALNODE:
1788                 return "Local";
1789         case IP_VS_CONN_F_TUNNEL:
1790                 return "Tunnel";
1791         case IP_VS_CONN_F_DROUTE:
1792                 return "Route";
1793         default:
1794                 return "Masq";
1795         }
1796 }
1797
1798
1799 /* Get the Nth entry in the two lists */
1800 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1801 {
1802         struct net *net = seq_file_net(seq);
1803         struct ip_vs_iter *iter = seq->private;
1804         int idx;
1805         struct ip_vs_service *svc;
1806
1807         /* look in hash by protocol */
1808         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1809                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1810                         if (net_eq(svc->net, net) && pos-- == 0) {
1811                                 iter->table = ip_vs_svc_table;
1812                                 iter->bucket = idx;
1813                                 return svc;
1814                         }
1815                 }
1816         }
1817
1818         /* keep looking in fwmark */
1819         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1820                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1821                         if (net_eq(svc->net, net) && pos-- == 0) {
1822                                 iter->table = ip_vs_svc_fwm_table;
1823                                 iter->bucket = idx;
1824                                 return svc;
1825                         }
1826                 }
1827         }
1828
1829         return NULL;
1830 }
1831
1832 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1833 __acquires(__ip_vs_svc_lock)
1834 {
1835
1836         read_lock_bh(&__ip_vs_svc_lock);
1837         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1838 }
1839
1840
1841 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1842 {
1843         struct list_head *e;
1844         struct ip_vs_iter *iter;
1845         struct ip_vs_service *svc;
1846
1847         ++*pos;
1848         if (v == SEQ_START_TOKEN)
1849                 return ip_vs_info_array(seq,0);
1850
1851         svc = v;
1852         iter = seq->private;
1853
1854         if (iter->table == ip_vs_svc_table) {
1855                 /* next service in table hashed by protocol */
1856                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1857                         return list_entry(e, struct ip_vs_service, s_list);
1858
1859
1860                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1861                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1862                                             s_list) {
1863                                 return svc;
1864                         }
1865                 }
1866
1867                 iter->table = ip_vs_svc_fwm_table;
1868                 iter->bucket = -1;
1869                 goto scan_fwmark;
1870         }
1871
1872         /* next service in hashed by fwmark */
1873         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1874                 return list_entry(e, struct ip_vs_service, f_list);
1875
1876  scan_fwmark:
1877         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1878                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1879                                     f_list)
1880                         return svc;
1881         }
1882
1883         return NULL;
1884 }
1885
1886 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1887 __releases(__ip_vs_svc_lock)
1888 {
1889         read_unlock_bh(&__ip_vs_svc_lock);
1890 }
1891
1892
1893 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1894 {
1895         if (v == SEQ_START_TOKEN) {
1896                 seq_printf(seq,
1897                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1898                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
1899                 seq_puts(seq,
1900                          "Prot LocalAddress:Port Scheduler Flags\n");
1901                 seq_puts(seq,
1902                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1903         } else {
1904                 const struct ip_vs_service *svc = v;
1905                 const struct ip_vs_iter *iter = seq->private;
1906                 const struct ip_vs_dest *dest;
1907
1908                 if (iter->table == ip_vs_svc_table) {
1909 #ifdef CONFIG_IP_VS_IPV6
1910                         if (svc->af == AF_INET6)
1911                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
1912                                            ip_vs_proto_name(svc->protocol),
1913                                            &svc->addr.in6,
1914                                            ntohs(svc->port),
1915                                            svc->scheduler->name);
1916                         else
1917 #endif
1918                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
1919                                            ip_vs_proto_name(svc->protocol),
1920                                            ntohl(svc->addr.ip),
1921                                            ntohs(svc->port),
1922                                            svc->scheduler->name,
1923                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1924                 } else {
1925                         seq_printf(seq, "FWM  %08X %s %s",
1926                                    svc->fwmark, svc->scheduler->name,
1927                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1928                 }
1929
1930                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1931                         seq_printf(seq, "persistent %d %08X\n",
1932                                 svc->timeout,
1933                                 ntohl(svc->netmask));
1934                 else
1935                         seq_putc(seq, '\n');
1936
1937                 list_for_each_entry(dest, &svc->destinations, n_list) {
1938 #ifdef CONFIG_IP_VS_IPV6
1939                         if (dest->af == AF_INET6)
1940                                 seq_printf(seq,
1941                                            "  -> [%pI6]:%04X"
1942                                            "      %-7s %-6d %-10d %-10d\n",
1943                                            &dest->addr.in6,
1944                                            ntohs(dest->port),
1945                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1946                                            atomic_read(&dest->weight),
1947                                            atomic_read(&dest->activeconns),
1948                                            atomic_read(&dest->inactconns));
1949                         else
1950 #endif
1951                                 seq_printf(seq,
1952                                            "  -> %08X:%04X      "
1953                                            "%-7s %-6d %-10d %-10d\n",
1954                                            ntohl(dest->addr.ip),
1955                                            ntohs(dest->port),
1956                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1957                                            atomic_read(&dest->weight),
1958                                            atomic_read(&dest->activeconns),
1959                                            atomic_read(&dest->inactconns));
1960
1961                 }
1962         }
1963         return 0;
1964 }
1965
1966 static const struct seq_operations ip_vs_info_seq_ops = {
1967         .start = ip_vs_info_seq_start,
1968         .next  = ip_vs_info_seq_next,
1969         .stop  = ip_vs_info_seq_stop,
1970         .show  = ip_vs_info_seq_show,
1971 };
1972
1973 static int ip_vs_info_open(struct inode *inode, struct file *file)
1974 {
1975         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
1976                         sizeof(struct ip_vs_iter));
1977 }
1978
1979 static const struct file_operations ip_vs_info_fops = {
1980         .owner   = THIS_MODULE,
1981         .open    = ip_vs_info_open,
1982         .read    = seq_read,
1983         .llseek  = seq_lseek,
1984         .release = seq_release_private,
1985 };
1986
1987 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1988 {
1989         struct net *net = seq_file_single_net(seq);
1990         struct ip_vs_stats_user show;
1991
1992 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1993         seq_puts(seq,
1994                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1995         seq_printf(seq,
1996                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1997
1998         ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
1999         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2000                    show.inpkts, show.outpkts,
2001                    (unsigned long long) show.inbytes,
2002                    (unsigned long long) show.outbytes);
2003
2004 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2005         seq_puts(seq,
2006                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2007         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2008                         show.cps, show.inpps, show.outpps,
2009                         show.inbps, show.outbps);
2010
2011         return 0;
2012 }
2013
2014 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2015 {
2016         return single_open_net(inode, file, ip_vs_stats_show);
2017 }
2018
2019 static const struct file_operations ip_vs_stats_fops = {
2020         .owner = THIS_MODULE,
2021         .open = ip_vs_stats_seq_open,
2022         .read = seq_read,
2023         .llseek = seq_lseek,
2024         .release = single_release,
2025 };
2026
2027 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2028 {
2029         struct net *net = seq_file_single_net(seq);
2030         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2031         struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2032         struct ip_vs_stats_user rates;
2033         int i;
2034
2035 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2036         seq_puts(seq,
2037                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2038         seq_printf(seq,
2039                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2040
2041         for_each_possible_cpu(i) {
2042                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2043                 unsigned int start;
2044                 __u64 inbytes, outbytes;
2045
2046                 do {
2047                         start = u64_stats_fetch_begin_bh(&u->syncp);
2048                         inbytes = u->ustats.inbytes;
2049                         outbytes = u->ustats.outbytes;
2050                 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2051
2052                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2053                            i, u->ustats.conns, u->ustats.inpkts,
2054                            u->ustats.outpkts, (__u64)inbytes,
2055                            (__u64)outbytes);
2056         }
2057
2058         spin_lock_bh(&tot_stats->lock);
2059
2060         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2061                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2062                    tot_stats->ustats.outpkts,
2063                    (unsigned long long) tot_stats->ustats.inbytes,
2064                    (unsigned long long) tot_stats->ustats.outbytes);
2065
2066         ip_vs_read_estimator(&rates, tot_stats);
2067
2068         spin_unlock_bh(&tot_stats->lock);
2069
2070 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2071         seq_puts(seq,
2072                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2073         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2074                         rates.cps,
2075                         rates.inpps,
2076                         rates.outpps,
2077                         rates.inbps,
2078                         rates.outbps);
2079
2080         return 0;
2081 }
2082
2083 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2084 {
2085         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2086 }
2087
2088 static const struct file_operations ip_vs_stats_percpu_fops = {
2089         .owner = THIS_MODULE,
2090         .open = ip_vs_stats_percpu_seq_open,
2091         .read = seq_read,
2092         .llseek = seq_lseek,
2093         .release = single_release,
2094 };
2095 #endif
2096
2097 /*
2098  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2099  */
2100 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2101 {
2102 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2103         struct ip_vs_proto_data *pd;
2104 #endif
2105
2106         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2107                   u->tcp_timeout,
2108                   u->tcp_fin_timeout,
2109                   u->udp_timeout);
2110
2111 #ifdef CONFIG_IP_VS_PROTO_TCP
2112         if (u->tcp_timeout) {
2113                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2114                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2115                         = u->tcp_timeout * HZ;
2116         }
2117
2118         if (u->tcp_fin_timeout) {
2119                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2120                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2121                         = u->tcp_fin_timeout * HZ;
2122         }
2123 #endif
2124
2125 #ifdef CONFIG_IP_VS_PROTO_UDP
2126         if (u->udp_timeout) {
2127                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2128                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2129                         = u->udp_timeout * HZ;
2130         }
2131 #endif
2132         return 0;
2133 }
2134
2135
2136 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2137 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2138 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2139                                  sizeof(struct ip_vs_dest_user))
2140 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2141 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2142 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2143
2144 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2145         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2146         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2147         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2148         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2149         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2150         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2151         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2152         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2153         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2154         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2155         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2156 };
2157
2158 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2159                                   struct ip_vs_service_user *usvc_compat)
2160 {
2161         memset(usvc, 0, sizeof(*usvc));
2162
2163         usvc->af                = AF_INET;
2164         usvc->protocol          = usvc_compat->protocol;
2165         usvc->addr.ip           = usvc_compat->addr;
2166         usvc->port              = usvc_compat->port;
2167         usvc->fwmark            = usvc_compat->fwmark;
2168
2169         /* Deep copy of sched_name is not needed here */
2170         usvc->sched_name        = usvc_compat->sched_name;
2171
2172         usvc->flags             = usvc_compat->flags;
2173         usvc->timeout           = usvc_compat->timeout;
2174         usvc->netmask           = usvc_compat->netmask;
2175 }
2176
2177 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2178                                    struct ip_vs_dest_user *udest_compat)
2179 {
2180         memset(udest, 0, sizeof(*udest));
2181
2182         udest->addr.ip          = udest_compat->addr;
2183         udest->port             = udest_compat->port;
2184         udest->conn_flags       = udest_compat->conn_flags;
2185         udest->weight           = udest_compat->weight;
2186         udest->u_threshold      = udest_compat->u_threshold;
2187         udest->l_threshold      = udest_compat->l_threshold;
2188 }
2189
2190 static int
2191 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2192 {
2193         struct net *net = sock_net(sk);
2194         int ret;
2195         unsigned char arg[MAX_ARG_LEN];
2196         struct ip_vs_service_user *usvc_compat;
2197         struct ip_vs_service_user_kern usvc;
2198         struct ip_vs_service *svc;
2199         struct ip_vs_dest_user *udest_compat;
2200         struct ip_vs_dest_user_kern udest;
2201
2202         if (!capable(CAP_NET_ADMIN))
2203                 return -EPERM;
2204
2205         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2206                 return -EINVAL;
2207         if (len < 0 || len >  MAX_ARG_LEN)
2208                 return -EINVAL;
2209         if (len != set_arglen[SET_CMDID(cmd)]) {
2210                 pr_err("set_ctl: len %u != %u\n",
2211                        len, set_arglen[SET_CMDID(cmd)]);
2212                 return -EINVAL;
2213         }
2214
2215         if (copy_from_user(arg, user, len) != 0)
2216                 return -EFAULT;
2217
2218         /* increase the module use count */
2219         ip_vs_use_count_inc();
2220
2221         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2222                 ret = -ERESTARTSYS;
2223                 goto out_dec;
2224         }
2225
2226         if (cmd == IP_VS_SO_SET_FLUSH) {
2227                 /* Flush the virtual service */
2228                 ret = ip_vs_flush(net);
2229                 goto out_unlock;
2230         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2231                 /* Set timeout values for (tcp tcpfin udp) */
2232                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2233                 goto out_unlock;
2234         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2235                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2236                 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2237                                         dm->syncid);
2238                 goto out_unlock;
2239         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2240                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2241                 ret = stop_sync_thread(net, dm->state);
2242                 goto out_unlock;
2243         }
2244
2245         usvc_compat = (struct ip_vs_service_user *)arg;
2246         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2247
2248         /* We only use the new structs internally, so copy userspace compat
2249          * structs to extended internal versions */
2250         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2251         ip_vs_copy_udest_compat(&udest, udest_compat);
2252
2253         if (cmd == IP_VS_SO_SET_ZERO) {
2254                 /* if no service address is set, zero counters in all */
2255                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2256                         ret = ip_vs_zero_all(net);
2257                         goto out_unlock;
2258                 }
2259         }
2260
2261         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2262         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2263             usvc.protocol != IPPROTO_SCTP) {
2264                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2265                        usvc.protocol, &usvc.addr.ip,
2266                        ntohs(usvc.port), usvc.sched_name);
2267                 ret = -EFAULT;
2268                 goto out_unlock;
2269         }
2270
2271         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2272         if (usvc.fwmark == 0)
2273                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2274                                            &usvc.addr, usvc.port);
2275         else
2276                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2277
2278         if (cmd != IP_VS_SO_SET_ADD
2279             && (svc == NULL || svc->protocol != usvc.protocol)) {
2280                 ret = -ESRCH;
2281                 goto out_unlock;
2282         }
2283
2284         switch (cmd) {
2285         case IP_VS_SO_SET_ADD:
2286                 if (svc != NULL)
2287                         ret = -EEXIST;
2288                 else
2289                         ret = ip_vs_add_service(net, &usvc, &svc);
2290                 break;
2291         case IP_VS_SO_SET_EDIT:
2292                 ret = ip_vs_edit_service(svc, &usvc);
2293                 break;
2294         case IP_VS_SO_SET_DEL:
2295                 ret = ip_vs_del_service(svc);
2296                 if (!ret)
2297                         goto out_unlock;
2298                 break;
2299         case IP_VS_SO_SET_ZERO:
2300                 ret = ip_vs_zero_service(svc);
2301                 break;
2302         case IP_VS_SO_SET_ADDDEST:
2303                 ret = ip_vs_add_dest(svc, &udest);
2304                 break;
2305         case IP_VS_SO_SET_EDITDEST:
2306                 ret = ip_vs_edit_dest(svc, &udest);
2307                 break;
2308         case IP_VS_SO_SET_DELDEST:
2309                 ret = ip_vs_del_dest(svc, &udest);
2310                 break;
2311         default:
2312                 ret = -EINVAL;
2313         }
2314
2315   out_unlock:
2316         mutex_unlock(&__ip_vs_mutex);
2317   out_dec:
2318         /* decrease the module use count */
2319         ip_vs_use_count_dec();
2320
2321         return ret;
2322 }
2323
2324
2325 static void
2326 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2327 {
2328         dst->protocol = src->protocol;
2329         dst->addr = src->addr.ip;
2330         dst->port = src->port;
2331         dst->fwmark = src->fwmark;
2332         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2333         dst->flags = src->flags;
2334         dst->timeout = src->timeout / HZ;
2335         dst->netmask = src->netmask;
2336         dst->num_dests = src->num_dests;
2337         ip_vs_copy_stats(&dst->stats, &src->stats);
2338 }
2339
2340 static inline int
2341 __ip_vs_get_service_entries(struct net *net,
2342                             const struct ip_vs_get_services *get,
2343                             struct ip_vs_get_services __user *uptr)
2344 {
2345         int idx, count=0;
2346         struct ip_vs_service *svc;
2347         struct ip_vs_service_entry entry;
2348         int ret = 0;
2349
2350         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2351                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2352                         /* Only expose IPv4 entries to old interface */
2353                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2354                                 continue;
2355
2356                         if (count >= get->num_services)
2357                                 goto out;
2358                         memset(&entry, 0, sizeof(entry));
2359                         ip_vs_copy_service(&entry, svc);
2360                         if (copy_to_user(&uptr->entrytable[count],
2361                                          &entry, sizeof(entry))) {
2362                                 ret = -EFAULT;
2363                                 goto out;
2364                         }
2365                         count++;
2366                 }
2367         }
2368
2369         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2370                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2371                         /* Only expose IPv4 entries to old interface */
2372                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2373                                 continue;
2374
2375                         if (count >= get->num_services)
2376                                 goto out;
2377                         memset(&entry, 0, sizeof(entry));
2378                         ip_vs_copy_service(&entry, svc);
2379                         if (copy_to_user(&uptr->entrytable[count],
2380                                          &entry, sizeof(entry))) {
2381                                 ret = -EFAULT;
2382                                 goto out;
2383                         }
2384                         count++;
2385                 }
2386         }
2387   out:
2388         return ret;
2389 }
2390
2391 static inline int
2392 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2393                          struct ip_vs_get_dests __user *uptr)
2394 {
2395         struct ip_vs_service *svc;
2396         union nf_inet_addr addr = { .ip = get->addr };
2397         int ret = 0;
2398
2399         if (get->fwmark)
2400                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2401         else
2402                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2403                                            get->port);
2404
2405         if (svc) {
2406                 int count = 0;
2407                 struct ip_vs_dest *dest;
2408                 struct ip_vs_dest_entry entry;
2409
2410                 list_for_each_entry(dest, &svc->destinations, n_list) {
2411                         if (count >= get->num_dests)
2412                                 break;
2413
2414                         entry.addr = dest->addr.ip;
2415                         entry.port = dest->port;
2416                         entry.conn_flags = atomic_read(&dest->conn_flags);
2417                         entry.weight = atomic_read(&dest->weight);
2418                         entry.u_threshold = dest->u_threshold;
2419                         entry.l_threshold = dest->l_threshold;
2420                         entry.activeconns = atomic_read(&dest->activeconns);
2421                         entry.inactconns = atomic_read(&dest->inactconns);
2422                         entry.persistconns = atomic_read(&dest->persistconns);
2423                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2424                         if (copy_to_user(&uptr->entrytable[count],
2425                                          &entry, sizeof(entry))) {
2426                                 ret = -EFAULT;
2427                                 break;
2428                         }
2429                         count++;
2430                 }
2431         } else
2432                 ret = -ESRCH;
2433         return ret;
2434 }
2435
2436 static inline void
2437 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2438 {
2439 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2440         struct ip_vs_proto_data *pd;
2441 #endif
2442
2443 #ifdef CONFIG_IP_VS_PROTO_TCP
2444         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2445         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2446         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2447 #endif
2448 #ifdef CONFIG_IP_VS_PROTO_UDP
2449         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2450         u->udp_timeout =
2451                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2452 #endif
2453 }
2454
2455
2456 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2457 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2458 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2459 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2460 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2461 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2462 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2463
2464 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2465         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2466         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2467         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2468         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2469         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2470         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2471         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2472 };
2473
2474 static int
2475 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2476 {
2477         unsigned char arg[128];
2478         int ret = 0;
2479         unsigned int copylen;
2480         struct net *net = sock_net(sk);
2481         struct netns_ipvs *ipvs = net_ipvs(net);
2482
2483         BUG_ON(!net);
2484         if (!capable(CAP_NET_ADMIN))
2485                 return -EPERM;
2486
2487         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2488                 return -EINVAL;
2489
2490         if (*len < get_arglen[GET_CMDID(cmd)]) {
2491                 pr_err("get_ctl: len %u < %u\n",
2492                        *len, get_arglen[GET_CMDID(cmd)]);
2493                 return -EINVAL;
2494         }
2495
2496         copylen = get_arglen[GET_CMDID(cmd)];
2497         if (copylen > 128)
2498                 return -EINVAL;
2499
2500         if (copy_from_user(arg, user, copylen) != 0)
2501                 return -EFAULT;
2502
2503         if (mutex_lock_interruptible(&__ip_vs_mutex))
2504                 return -ERESTARTSYS;
2505
2506         switch (cmd) {
2507         case IP_VS_SO_GET_VERSION:
2508         {
2509                 char buf[64];
2510
2511                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2512                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2513                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2514                         ret = -EFAULT;
2515                         goto out;
2516                 }
2517                 *len = strlen(buf)+1;
2518         }
2519         break;
2520
2521         case IP_VS_SO_GET_INFO:
2522         {
2523                 struct ip_vs_getinfo info;
2524                 info.version = IP_VS_VERSION_CODE;
2525                 info.size = ip_vs_conn_tab_size;
2526                 info.num_services = ipvs->num_services;
2527                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2528                         ret = -EFAULT;
2529         }
2530         break;
2531
2532         case IP_VS_SO_GET_SERVICES:
2533         {
2534                 struct ip_vs_get_services *get;
2535                 int size;
2536
2537                 get = (struct ip_vs_get_services *)arg;
2538                 size = sizeof(*get) +
2539                         sizeof(struct ip_vs_service_entry) * get->num_services;
2540                 if (*len != size) {
2541                         pr_err("length: %u != %u\n", *len, size);
2542                         ret = -EINVAL;
2543                         goto out;
2544                 }
2545                 ret = __ip_vs_get_service_entries(net, get, user);
2546         }
2547         break;
2548
2549         case IP_VS_SO_GET_SERVICE:
2550         {
2551                 struct ip_vs_service_entry *entry;
2552                 struct ip_vs_service *svc;
2553                 union nf_inet_addr addr;
2554
2555                 entry = (struct ip_vs_service_entry *)arg;
2556                 addr.ip = entry->addr;
2557                 if (entry->fwmark)
2558                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2559                 else
2560                         svc = __ip_vs_service_find(net, AF_INET,
2561                                                    entry->protocol, &addr,
2562                                                    entry->port);
2563                 if (svc) {
2564                         ip_vs_copy_service(entry, svc);
2565                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2566                                 ret = -EFAULT;
2567                 } else
2568                         ret = -ESRCH;
2569         }
2570         break;
2571
2572         case IP_VS_SO_GET_DESTS:
2573         {
2574                 struct ip_vs_get_dests *get;
2575                 int size;
2576
2577                 get = (struct ip_vs_get_dests *)arg;
2578                 size = sizeof(*get) +
2579                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2580                 if (*len != size) {
2581                         pr_err("length: %u != %u\n", *len, size);
2582                         ret = -EINVAL;
2583                         goto out;
2584                 }
2585                 ret = __ip_vs_get_dest_entries(net, get, user);
2586         }
2587         break;
2588
2589         case IP_VS_SO_GET_TIMEOUT:
2590         {
2591                 struct ip_vs_timeout_user t;
2592
2593                 __ip_vs_get_timeouts(net, &t);
2594                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2595                         ret = -EFAULT;
2596         }
2597         break;
2598
2599         case IP_VS_SO_GET_DAEMON:
2600         {
2601                 struct ip_vs_daemon_user d[2];
2602
2603                 memset(&d, 0, sizeof(d));
2604                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2605                         d[0].state = IP_VS_STATE_MASTER;
2606                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2607                                 sizeof(d[0].mcast_ifn));
2608                         d[0].syncid = ipvs->master_syncid;
2609                 }
2610                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2611                         d[1].state = IP_VS_STATE_BACKUP;
2612                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2613                                 sizeof(d[1].mcast_ifn));
2614                         d[1].syncid = ipvs->backup_syncid;
2615                 }
2616                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2617                         ret = -EFAULT;
2618         }
2619         break;
2620
2621         default:
2622                 ret = -EINVAL;
2623         }
2624
2625   out:
2626         mutex_unlock(&__ip_vs_mutex);
2627         return ret;
2628 }
2629
2630
2631 static struct nf_sockopt_ops ip_vs_sockopts = {
2632         .pf             = PF_INET,
2633         .set_optmin     = IP_VS_BASE_CTL,
2634         .set_optmax     = IP_VS_SO_SET_MAX+1,
2635         .set            = do_ip_vs_set_ctl,
2636         .get_optmin     = IP_VS_BASE_CTL,
2637         .get_optmax     = IP_VS_SO_GET_MAX+1,
2638         .get            = do_ip_vs_get_ctl,
2639         .owner          = THIS_MODULE,
2640 };
2641
2642 /*
2643  * Generic Netlink interface
2644  */
2645
2646 /* IPVS genetlink family */
2647 static struct genl_family ip_vs_genl_family = {
2648         .id             = GENL_ID_GENERATE,
2649         .hdrsize        = 0,
2650         .name           = IPVS_GENL_NAME,
2651         .version        = IPVS_GENL_VERSION,
2652         .maxattr        = IPVS_CMD_MAX,
2653         .netnsok        = true,         /* Make ipvsadm to work on netns */
2654 };
2655
2656 /* Policy used for first-level command attributes */
2657 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2658         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2659         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2660         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2661         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2662         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2663         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2664 };
2665
2666 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2667 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2668         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2669         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2670                                             .len = IP_VS_IFNAME_MAXLEN },
2671         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2672 };
2673
2674 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2675 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2676         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2677         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2678         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2679                                             .len = sizeof(union nf_inet_addr) },
2680         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2681         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2682         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2683                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2684         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2685                                             .len = IP_VS_PENAME_MAXLEN },
2686         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2687                                             .len = sizeof(struct ip_vs_flags) },
2688         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2689         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2690         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2691 };
2692
2693 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2694 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2695         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2696                                             .len = sizeof(union nf_inet_addr) },
2697         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2698         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2699         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2700         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2701         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2702         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2703         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2704         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2705         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2706 };
2707
2708 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2709                                  struct ip_vs_stats *stats)
2710 {
2711         struct ip_vs_stats_user ustats;
2712         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2713         if (!nl_stats)
2714                 return -EMSGSIZE;
2715
2716         ip_vs_copy_stats(&ustats, stats);
2717
2718         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns);
2719         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts);
2720         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts);
2721         NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes);
2722         NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes);
2723         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps);
2724         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps);
2725         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps);
2726         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps);
2727         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps);
2728
2729         nla_nest_end(skb, nl_stats);
2730
2731         return 0;
2732
2733 nla_put_failure:
2734         nla_nest_cancel(skb, nl_stats);
2735         return -EMSGSIZE;
2736 }
2737
2738 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2739                                    struct ip_vs_service *svc)
2740 {
2741         struct nlattr *nl_service;
2742         struct ip_vs_flags flags = { .flags = svc->flags,
2743                                      .mask = ~0 };
2744
2745         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2746         if (!nl_service)
2747                 return -EMSGSIZE;
2748
2749         NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2750
2751         if (svc->fwmark) {
2752                 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2753         } else {
2754                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2755                 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2756                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2757         }
2758
2759         NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2760         if (svc->pe)
2761                 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
2762         NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2763         NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2764         NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2765
2766         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2767                 goto nla_put_failure;
2768
2769         nla_nest_end(skb, nl_service);
2770
2771         return 0;
2772
2773 nla_put_failure:
2774         nla_nest_cancel(skb, nl_service);
2775         return -EMSGSIZE;
2776 }
2777
2778 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2779                                    struct ip_vs_service *svc,
2780                                    struct netlink_callback *cb)
2781 {
2782         void *hdr;
2783
2784         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2785                           &ip_vs_genl_family, NLM_F_MULTI,
2786                           IPVS_CMD_NEW_SERVICE);
2787         if (!hdr)
2788                 return -EMSGSIZE;
2789
2790         if (ip_vs_genl_fill_service(skb, svc) < 0)
2791                 goto nla_put_failure;
2792
2793         return genlmsg_end(skb, hdr);
2794
2795 nla_put_failure:
2796         genlmsg_cancel(skb, hdr);
2797         return -EMSGSIZE;
2798 }
2799
2800 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2801                                     struct netlink_callback *cb)
2802 {
2803         int idx = 0, i;
2804         int start = cb->args[0];
2805         struct ip_vs_service *svc;
2806         struct net *net = skb_sknet(skb);
2807
2808         mutex_lock(&__ip_vs_mutex);
2809         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2810                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2811                         if (++idx <= start || !net_eq(svc->net, net))
2812                                 continue;
2813                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2814                                 idx--;
2815                                 goto nla_put_failure;
2816                         }
2817                 }
2818         }
2819
2820         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2821                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2822                         if (++idx <= start || !net_eq(svc->net, net))
2823                                 continue;
2824                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2825                                 idx--;
2826                                 goto nla_put_failure;
2827                         }
2828                 }
2829         }
2830
2831 nla_put_failure:
2832         mutex_unlock(&__ip_vs_mutex);
2833         cb->args[0] = idx;
2834
2835         return skb->len;
2836 }
2837
2838 static int ip_vs_genl_parse_service(struct net *net,
2839                                     struct ip_vs_service_user_kern *usvc,
2840                                     struct nlattr *nla, int full_entry,
2841                                     struct ip_vs_service **ret_svc)
2842 {
2843         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2844         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2845         struct ip_vs_service *svc;
2846
2847         /* Parse mandatory identifying service fields first */
2848         if (nla == NULL ||
2849             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2850                 return -EINVAL;
2851
2852         nla_af          = attrs[IPVS_SVC_ATTR_AF];
2853         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
2854         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
2855         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
2856         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
2857
2858         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2859                 return -EINVAL;
2860
2861         memset(usvc, 0, sizeof(*usvc));
2862
2863         usvc->af = nla_get_u16(nla_af);
2864 #ifdef CONFIG_IP_VS_IPV6
2865         if (usvc->af != AF_INET && usvc->af != AF_INET6)
2866 #else
2867         if (usvc->af != AF_INET)
2868 #endif
2869                 return -EAFNOSUPPORT;
2870
2871         if (nla_fwmark) {
2872                 usvc->protocol = IPPROTO_TCP;
2873                 usvc->fwmark = nla_get_u32(nla_fwmark);
2874         } else {
2875                 usvc->protocol = nla_get_u16(nla_protocol);
2876                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2877                 usvc->port = nla_get_u16(nla_port);
2878                 usvc->fwmark = 0;
2879         }
2880
2881         if (usvc->fwmark)
2882                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2883         else
2884                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2885                                            &usvc->addr, usvc->port);
2886         *ret_svc = svc;
2887
2888         /* If a full entry was requested, check for the additional fields */
2889         if (full_entry) {
2890                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2891                               *nla_netmask;
2892                 struct ip_vs_flags flags;
2893
2894                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2895                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2896                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2897                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2898                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2899
2900                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2901                         return -EINVAL;
2902
2903                 nla_memcpy(&flags, nla_flags, sizeof(flags));
2904
2905                 /* prefill flags from service if it already exists */
2906                 if (svc)
2907                         usvc->flags = svc->flags;
2908
2909                 /* set new flags from userland */
2910                 usvc->flags = (usvc->flags & ~flags.mask) |
2911                               (flags.flags & flags.mask);
2912                 usvc->sched_name = nla_data(nla_sched);
2913                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2914                 usvc->timeout = nla_get_u32(nla_timeout);
2915                 usvc->netmask = nla_get_u32(nla_netmask);
2916         }
2917
2918         return 0;
2919 }
2920
2921 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
2922                                                      struct nlattr *nla)
2923 {
2924         struct ip_vs_service_user_kern usvc;
2925         struct ip_vs_service *svc;
2926         int ret;
2927
2928         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
2929         return ret ? ERR_PTR(ret) : svc;
2930 }
2931
2932 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2933 {
2934         struct nlattr *nl_dest;
2935
2936         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2937         if (!nl_dest)
2938                 return -EMSGSIZE;
2939
2940         NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2941         NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2942
2943         NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2944                     atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2945         NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2946         NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2947         NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2948         NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2949                     atomic_read(&dest->activeconns));
2950         NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2951                     atomic_read(&dest->inactconns));
2952         NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2953                     atomic_read(&dest->persistconns));
2954
2955         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2956                 goto nla_put_failure;
2957
2958         nla_nest_end(skb, nl_dest);
2959
2960         return 0;
2961
2962 nla_put_failure:
2963         nla_nest_cancel(skb, nl_dest);
2964         return -EMSGSIZE;
2965 }
2966
2967 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2968                                 struct netlink_callback *cb)
2969 {
2970         void *hdr;
2971
2972         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2973                           &ip_vs_genl_family, NLM_F_MULTI,
2974                           IPVS_CMD_NEW_DEST);
2975         if (!hdr)
2976                 return -EMSGSIZE;
2977
2978         if (ip_vs_genl_fill_dest(skb, dest) < 0)
2979                 goto nla_put_failure;
2980
2981         return genlmsg_end(skb, hdr);
2982
2983 nla_put_failure:
2984         genlmsg_cancel(skb, hdr);
2985         return -EMSGSIZE;
2986 }
2987
2988 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2989                                  struct netlink_callback *cb)
2990 {
2991         int idx = 0;
2992         int start = cb->args[0];
2993         struct ip_vs_service *svc;
2994         struct ip_vs_dest *dest;
2995         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2996         struct net *net = skb_sknet(skb);
2997
2998         mutex_lock(&__ip_vs_mutex);
2999
3000         /* Try to find the service for which to dump destinations */
3001         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3002                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3003                 goto out_err;
3004
3005
3006         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3007         if (IS_ERR(svc) || svc == NULL)
3008                 goto out_err;
3009
3010         /* Dump the destinations */
3011         list_for_each_entry(dest, &svc->destinations, n_list) {
3012                 if (++idx <= start)
3013                         continue;
3014                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3015                         idx--;
3016                         goto nla_put_failure;
3017                 }
3018         }
3019
3020 nla_put_failure:
3021         cb->args[0] = idx;
3022
3023 out_err:
3024         mutex_unlock(&__ip_vs_mutex);
3025
3026         return skb->len;
3027 }
3028
3029 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3030                                  struct nlattr *nla, int full_entry)
3031 {
3032         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3033         struct nlattr *nla_addr, *nla_port;
3034
3035         /* Parse mandatory identifying destination fields first */
3036         if (nla == NULL ||
3037             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3038                 return -EINVAL;
3039
3040         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3041         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3042
3043         if (!(nla_addr && nla_port))
3044                 return -EINVAL;
3045
3046         memset(udest, 0, sizeof(*udest));
3047
3048         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3049         udest->port = nla_get_u16(nla_port);
3050
3051         /* If a full entry was requested, check for the additional fields */
3052         if (full_entry) {
3053                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3054                               *nla_l_thresh;
3055
3056                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3057                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3058                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3059                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3060
3061                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3062                         return -EINVAL;
3063
3064                 udest->conn_flags = nla_get_u32(nla_fwd)
3065                                     & IP_VS_CONN_F_FWD_MASK;
3066                 udest->weight = nla_get_u32(nla_weight);
3067                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3068                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3069         }
3070
3071         return 0;
3072 }
3073
3074 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3075                                   const char *mcast_ifn, __be32 syncid)
3076 {
3077         struct nlattr *nl_daemon;
3078
3079         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3080         if (!nl_daemon)
3081                 return -EMSGSIZE;
3082
3083         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
3084         NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
3085         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
3086
3087         nla_nest_end(skb, nl_daemon);
3088
3089         return 0;
3090
3091 nla_put_failure:
3092         nla_nest_cancel(skb, nl_daemon);
3093         return -EMSGSIZE;
3094 }
3095
3096 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3097                                   const char *mcast_ifn, __be32 syncid,
3098                                   struct netlink_callback *cb)
3099 {
3100         void *hdr;
3101         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
3102                           &ip_vs_genl_family, NLM_F_MULTI,
3103                           IPVS_CMD_NEW_DAEMON);
3104         if (!hdr)
3105                 return -EMSGSIZE;
3106
3107         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3108                 goto nla_put_failure;
3109
3110         return genlmsg_end(skb, hdr);
3111
3112 nla_put_failure:
3113         genlmsg_cancel(skb, hdr);
3114         return -EMSGSIZE;
3115 }
3116
3117 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3118                                    struct netlink_callback *cb)
3119 {
3120         struct net *net = skb_sknet(skb);
3121         struct netns_ipvs *ipvs = net_ipvs(net);
3122
3123         mutex_lock(&__ip_vs_mutex);
3124         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3125                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3126                                            ipvs->master_mcast_ifn,
3127                                            ipvs->master_syncid, cb) < 0)
3128                         goto nla_put_failure;
3129
3130                 cb->args[0] = 1;
3131         }
3132
3133         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3134                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3135                                            ipvs->backup_mcast_ifn,
3136                                            ipvs->backup_syncid, cb) < 0)
3137                         goto nla_put_failure;
3138
3139                 cb->args[1] = 1;
3140         }
3141
3142 nla_put_failure:
3143         mutex_unlock(&__ip_vs_mutex);
3144
3145         return skb->len;
3146 }
3147
3148 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3149 {
3150         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3151               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3152               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3153                 return -EINVAL;
3154
3155         return start_sync_thread(net,
3156                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3157                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3158                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3159 }
3160
3161 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3162 {
3163         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3164                 return -EINVAL;
3165
3166         return stop_sync_thread(net,
3167                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3168 }
3169
3170 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3171 {
3172         struct ip_vs_timeout_user t;
3173
3174         __ip_vs_get_timeouts(net, &t);
3175
3176         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3177                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3178
3179         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3180                 t.tcp_fin_timeout =
3181                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3182
3183         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3184                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3185
3186         return ip_vs_set_timeout(net, &t);
3187 }
3188
3189 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3190 {
3191         struct ip_vs_service *svc = NULL;
3192         struct ip_vs_service_user_kern usvc;
3193         struct ip_vs_dest_user_kern udest;
3194         int ret = 0, cmd;
3195         int need_full_svc = 0, need_full_dest = 0;
3196         struct net *net;
3197         struct netns_ipvs *ipvs;
3198
3199         net = skb_sknet(skb);
3200         ipvs = net_ipvs(net);
3201         cmd = info->genlhdr->cmd;
3202
3203         mutex_lock(&__ip_vs_mutex);
3204
3205         if (cmd == IPVS_CMD_FLUSH) {
3206                 ret = ip_vs_flush(net);
3207                 goto out;
3208         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3209                 ret = ip_vs_genl_set_config(net, info->attrs);
3210                 goto out;
3211         } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3212                    cmd == IPVS_CMD_DEL_DAEMON) {
3213
3214                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3215
3216                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3217                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3218                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3219                                      ip_vs_daemon_policy)) {
3220                         ret = -EINVAL;
3221                         goto out;
3222                 }
3223
3224                 if (cmd == IPVS_CMD_NEW_DAEMON)
3225                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3226                 else
3227                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3228                 goto out;
3229         } else if (cmd == IPVS_CMD_ZERO &&
3230                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3231                 ret = ip_vs_zero_all(net);
3232                 goto out;
3233         }
3234
3235         /* All following commands require a service argument, so check if we
3236          * received a valid one. We need a full service specification when
3237          * adding / editing a service. Only identifying members otherwise. */
3238         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3239                 need_full_svc = 1;
3240
3241         ret = ip_vs_genl_parse_service(net, &usvc,
3242                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3243                                        need_full_svc, &svc);
3244         if (ret)
3245                 goto out;
3246
3247         /* Unless we're adding a new service, the service must already exist */
3248         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3249                 ret = -ESRCH;
3250                 goto out;
3251         }
3252
3253         /* Destination commands require a valid destination argument. For
3254          * adding / editing a destination, we need a full destination
3255          * specification. */
3256         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3257             cmd == IPVS_CMD_DEL_DEST) {
3258                 if (cmd != IPVS_CMD_DEL_DEST)
3259                         need_full_dest = 1;
3260
3261                 ret = ip_vs_genl_parse_dest(&udest,
3262                                             info->attrs[IPVS_CMD_ATTR_DEST],
3263                                             need_full_dest);
3264                 if (ret)
3265                         goto out;
3266         }
3267
3268         switch (cmd) {
3269         case IPVS_CMD_NEW_SERVICE:
3270                 if (svc == NULL)
3271                         ret = ip_vs_add_service(net, &usvc, &svc);
3272                 else
3273                         ret = -EEXIST;
3274                 break;
3275         case IPVS_CMD_SET_SERVICE:
3276                 ret = ip_vs_edit_service(svc, &usvc);
3277                 break;
3278         case IPVS_CMD_DEL_SERVICE:
3279                 ret = ip_vs_del_service(svc);
3280                 /* do not use svc, it can be freed */
3281                 break;
3282         case IPVS_CMD_NEW_DEST:
3283                 ret = ip_vs_add_dest(svc, &udest);
3284                 break;
3285         case IPVS_CMD_SET_DEST:
3286                 ret = ip_vs_edit_dest(svc, &udest);
3287                 break;
3288         case IPVS_CMD_DEL_DEST:
3289                 ret = ip_vs_del_dest(svc, &udest);
3290                 break;
3291         case IPVS_CMD_ZERO:
3292                 ret = ip_vs_zero_service(svc);
3293                 break;
3294         default:
3295                 ret = -EINVAL;
3296         }
3297
3298 out:
3299         mutex_unlock(&__ip_vs_mutex);
3300
3301         return ret;
3302 }
3303
3304 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3305 {
3306         struct sk_buff *msg;
3307         void *reply;
3308         int ret, cmd, reply_cmd;
3309         struct net *net;
3310         struct netns_ipvs *ipvs;
3311
3312         net = skb_sknet(skb);
3313         ipvs = net_ipvs(net);
3314         cmd = info->genlhdr->cmd;
3315
3316         if (cmd == IPVS_CMD_GET_SERVICE)
3317                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3318         else if (cmd == IPVS_CMD_GET_INFO)
3319                 reply_cmd = IPVS_CMD_SET_INFO;
3320         else if (cmd == IPVS_CMD_GET_CONFIG)
3321                 reply_cmd = IPVS_CMD_SET_CONFIG;
3322         else {
3323                 pr_err("unknown Generic Netlink command\n");
3324                 return -EINVAL;
3325         }
3326
3327         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3328         if (!msg)
3329                 return -ENOMEM;
3330
3331         mutex_lock(&__ip_vs_mutex);
3332
3333         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3334         if (reply == NULL)
3335                 goto nla_put_failure;
3336
3337         switch (cmd) {
3338         case IPVS_CMD_GET_SERVICE:
3339         {
3340                 struct ip_vs_service *svc;
3341
3342                 svc = ip_vs_genl_find_service(net,
3343                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3344                 if (IS_ERR(svc)) {
3345                         ret = PTR_ERR(svc);
3346                         goto out_err;
3347                 } else if (svc) {
3348                         ret = ip_vs_genl_fill_service(msg, svc);
3349                         if (ret)
3350                                 goto nla_put_failure;
3351                 } else {
3352                         ret = -ESRCH;
3353                         goto out_err;
3354                 }
3355
3356                 break;
3357         }
3358
3359         case IPVS_CMD_GET_CONFIG:
3360         {
3361                 struct ip_vs_timeout_user t;
3362
3363                 __ip_vs_get_timeouts(net, &t);
3364 #ifdef CONFIG_IP_VS_PROTO_TCP
3365                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3366                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3367                             t.tcp_fin_timeout);
3368 #endif
3369 #ifdef CONFIG_IP_VS_PROTO_UDP
3370                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3371 #endif
3372
3373                 break;
3374         }
3375
3376         case IPVS_CMD_GET_INFO:
3377                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3378                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3379                             ip_vs_conn_tab_size);
3380                 break;
3381         }
3382
3383         genlmsg_end(msg, reply);
3384         ret = genlmsg_reply(msg, info);
3385         goto out;
3386
3387 nla_put_failure:
3388         pr_err("not enough space in Netlink message\n");
3389         ret = -EMSGSIZE;
3390
3391 out_err:
3392         nlmsg_free(msg);
3393 out:
3394         mutex_unlock(&__ip_vs_mutex);
3395
3396         return ret;
3397 }
3398
3399
3400 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3401         {
3402                 .cmd    = IPVS_CMD_NEW_SERVICE,
3403                 .flags  = GENL_ADMIN_PERM,
3404                 .policy = ip_vs_cmd_policy,
3405                 .doit   = ip_vs_genl_set_cmd,
3406         },
3407         {
3408                 .cmd    = IPVS_CMD_SET_SERVICE,
3409                 .flags  = GENL_ADMIN_PERM,
3410                 .policy = ip_vs_cmd_policy,
3411                 .doit   = ip_vs_genl_set_cmd,
3412         },
3413         {
3414                 .cmd    = IPVS_CMD_DEL_SERVICE,
3415                 .flags  = GENL_ADMIN_PERM,
3416                 .policy = ip_vs_cmd_policy,
3417                 .doit   = ip_vs_genl_set_cmd,
3418         },
3419         {
3420                 .cmd    = IPVS_CMD_GET_SERVICE,
3421                 .flags  = GENL_ADMIN_PERM,
3422                 .doit   = ip_vs_genl_get_cmd,
3423                 .dumpit = ip_vs_genl_dump_services,
3424                 .policy = ip_vs_cmd_policy,
3425         },
3426         {
3427                 .cmd    = IPVS_CMD_NEW_DEST,
3428                 .flags  = GENL_ADMIN_PERM,
3429                 .policy = ip_vs_cmd_policy,
3430                 .doit   = ip_vs_genl_set_cmd,
3431         },
3432         {
3433                 .cmd    = IPVS_CMD_SET_DEST,
3434                 .flags  = GENL_ADMIN_PERM,
3435                 .policy = ip_vs_cmd_policy,
3436                 .doit   = ip_vs_genl_set_cmd,
3437         },
3438         {
3439                 .cmd    = IPVS_CMD_DEL_DEST,
3440                 .flags  = GENL_ADMIN_PERM,
3441                 .policy = ip_vs_cmd_policy,
3442                 .doit   = ip_vs_genl_set_cmd,
3443         },
3444         {
3445                 .cmd    = IPVS_CMD_GET_DEST,
3446                 .flags  = GENL_ADMIN_PERM,
3447                 .policy = ip_vs_cmd_policy,
3448                 .dumpit = ip_vs_genl_dump_dests,
3449         },
3450         {
3451                 .cmd    = IPVS_CMD_NEW_DAEMON,
3452                 .flags  = GENL_ADMIN_PERM,
3453                 .policy = ip_vs_cmd_policy,
3454                 .doit   = ip_vs_genl_set_cmd,
3455         },
3456         {
3457                 .cmd    = IPVS_CMD_DEL_DAEMON,
3458                 .flags  = GENL_ADMIN_PERM,
3459                 .policy = ip_vs_cmd_policy,
3460                 .doit   = ip_vs_genl_set_cmd,
3461         },
3462         {
3463                 .cmd    = IPVS_CMD_GET_DAEMON,
3464                 .flags  = GENL_ADMIN_PERM,
3465                 .dumpit = ip_vs_genl_dump_daemons,
3466         },
3467         {
3468                 .cmd    = IPVS_CMD_SET_CONFIG,
3469                 .flags  = GENL_ADMIN_PERM,
3470                 .policy = ip_vs_cmd_policy,
3471                 .doit   = ip_vs_genl_set_cmd,
3472         },
3473         {
3474                 .cmd    = IPVS_CMD_GET_CONFIG,
3475                 .flags  = GENL_ADMIN_PERM,
3476                 .doit   = ip_vs_genl_get_cmd,
3477         },
3478         {
3479                 .cmd    = IPVS_CMD_GET_INFO,
3480                 .flags  = GENL_ADMIN_PERM,
3481                 .doit   = ip_vs_genl_get_cmd,
3482         },
3483         {
3484                 .cmd    = IPVS_CMD_ZERO,
3485                 .flags  = GENL_ADMIN_PERM,
3486                 .policy = ip_vs_cmd_policy,
3487                 .doit   = ip_vs_genl_set_cmd,
3488         },
3489         {
3490                 .cmd    = IPVS_CMD_FLUSH,
3491                 .flags  = GENL_ADMIN_PERM,
3492                 .doit   = ip_vs_genl_set_cmd,
3493         },
3494 };
3495
3496 static int __init ip_vs_genl_register(void)
3497 {
3498         return genl_register_family_with_ops(&ip_vs_genl_family,
3499                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3500 }
3501
3502 static void ip_vs_genl_unregister(void)
3503 {
3504         genl_unregister_family(&ip_vs_genl_family);
3505 }
3506
3507 /* End of Generic Netlink interface definitions */
3508
3509 /*
3510  * per netns init/exit func.
3511  */
3512 #ifdef CONFIG_SYSCTL
3513 int __net_init __ip_vs_control_init_sysctl(struct net *net)
3514 {
3515         int idx;
3516         struct netns_ipvs *ipvs = net_ipvs(net);
3517         struct ctl_table *tbl;
3518
3519         atomic_set(&ipvs->dropentry, 0);
3520         spin_lock_init(&ipvs->dropentry_lock);
3521         spin_lock_init(&ipvs->droppacket_lock);
3522         spin_lock_init(&ipvs->securetcp_lock);
3523
3524         if (!net_eq(net, &init_net)) {
3525                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3526                 if (tbl == NULL)
3527                         return -ENOMEM;
3528         } else
3529                 tbl = vs_vars;
3530         /* Initialize sysctl defaults */
3531         idx = 0;
3532         ipvs->sysctl_amemthresh = 1024;
3533         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3534         ipvs->sysctl_am_droprate = 10;
3535         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3536         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3537         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3538 #ifdef CONFIG_IP_VS_NFCT
3539         tbl[idx++].data = &ipvs->sysctl_conntrack;
3540 #endif
3541         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3542         ipvs->sysctl_snat_reroute = 1;
3543         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3544         ipvs->sysctl_sync_ver = 1;
3545         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3546         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3547         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3548         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3549         ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
3550         ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
3551         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3552         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3553         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3554
3555
3556         ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
3557                                                      tbl);
3558         if (ipvs->sysctl_hdr == NULL) {
3559                 if (!net_eq(net, &init_net))
3560                         kfree(tbl);
3561                 return -ENOMEM;
3562         }
3563         ip_vs_start_estimator(net, &ipvs->tot_stats);
3564         ipvs->sysctl_tbl = tbl;
3565         /* Schedule defense work */
3566         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3567         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3568
3569         return 0;
3570 }
3571
3572 void __net_init __ip_vs_control_cleanup_sysctl(struct net *net)
3573 {
3574         struct netns_ipvs *ipvs = net_ipvs(net);
3575
3576         cancel_delayed_work_sync(&ipvs->defense_work);
3577         cancel_work_sync(&ipvs->defense_work.work);
3578         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3579 }
3580
3581 #else
3582
3583 int __net_init __ip_vs_control_init_sysctl(struct net *net) { return 0; }
3584 void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { }
3585
3586 #endif
3587
3588 int __net_init __ip_vs_control_init(struct net *net)
3589 {
3590         int idx;
3591         struct netns_ipvs *ipvs = net_ipvs(net);
3592
3593         ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
3594
3595         /* Initialize rs_table */
3596         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3597                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3598
3599         INIT_LIST_HEAD(&ipvs->dest_trash);
3600         atomic_set(&ipvs->ftpsvc_counter, 0);
3601         atomic_set(&ipvs->nullsvc_counter, 0);
3602
3603         /* procfs stats */
3604         ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3605         if (!ipvs->tot_stats.cpustats) {
3606                 pr_err("%s(): alloc_percpu.\n", __func__);
3607                 return -ENOMEM;
3608         }
3609         spin_lock_init(&ipvs->tot_stats.lock);
3610
3611         proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3612         proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3613         proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3614                              &ip_vs_stats_percpu_fops);
3615
3616         if (__ip_vs_control_init_sysctl(net))
3617                 goto err;
3618
3619         return 0;
3620
3621 err:
3622         free_percpu(ipvs->tot_stats.cpustats);
3623         return -ENOMEM;
3624 }
3625
3626 static void __net_exit __ip_vs_control_cleanup(struct net *net)
3627 {
3628         struct netns_ipvs *ipvs = net_ipvs(net);
3629
3630         ip_vs_trash_cleanup(net);
3631         ip_vs_stop_estimator(net, &ipvs->tot_stats);
3632         __ip_vs_control_cleanup_sysctl(net);
3633         proc_net_remove(net, "ip_vs_stats_percpu");
3634         proc_net_remove(net, "ip_vs_stats");
3635         proc_net_remove(net, "ip_vs");
3636         free_percpu(ipvs->tot_stats.cpustats);
3637 }
3638
3639 static struct pernet_operations ipvs_control_ops = {
3640         .init = __ip_vs_control_init,
3641         .exit = __ip_vs_control_cleanup,
3642 };
3643
3644 int __init ip_vs_control_init(void)
3645 {
3646         int idx;
3647         int ret;
3648
3649         EnterFunction(2);
3650
3651         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3652         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
3653                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3654                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3655         }
3656
3657         ret = register_pernet_subsys(&ipvs_control_ops);
3658         if (ret) {
3659                 pr_err("cannot register namespace.\n");
3660                 goto err;
3661         }
3662
3663         smp_wmb();      /* Do we really need it now ? */
3664
3665         ret = nf_register_sockopt(&ip_vs_sockopts);
3666         if (ret) {
3667                 pr_err("cannot register sockopt.\n");
3668                 goto err_net;
3669         }
3670
3671         ret = ip_vs_genl_register();
3672         if (ret) {
3673                 pr_err("cannot register Generic Netlink interface.\n");
3674                 nf_unregister_sockopt(&ip_vs_sockopts);
3675                 goto err_net;
3676         }
3677
3678         LeaveFunction(2);
3679         return 0;
3680
3681 err_net:
3682         unregister_pernet_subsys(&ipvs_control_ops);
3683 err:
3684         return ret;
3685 }
3686
3687
3688 void ip_vs_control_cleanup(void)
3689 {
3690         EnterFunction(2);
3691         unregister_pernet_subsys(&ipvs_control_ops);
3692         ip_vs_genl_unregister();
3693         nf_unregister_sockopt(&ip_vs_sockopts);
3694         LeaveFunction(2);
3695 }