]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - ipc/shm.c
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
[karo-tx-linux.git] / ipc / shm.c
1 /*
2  * linux/ipc/shm.c
3  * Copyright (C) 1992, 1993 Krishna Balasubramanian
4  *       Many improvements/fixes by Bruno Haible.
5  * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
6  * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
7  *
8  * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
9  * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
10  * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
11  * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
12  * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
13  * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
14  * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
15  *
16  * support for audit of ipc object properties and permission changes
17  * Dustin Kirkland <dustin.kirkland@us.ibm.com>
18  *
19  * namespaces support
20  * OpenVZ, SWsoft Inc.
21  * Pavel Emelianov <xemul@openvz.org>
22  *
23  * Better ipc lock (kern_ipc_perm.lock) handling
24  * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
25  */
26
27 #include <linux/slab.h>
28 #include <linux/mm.h>
29 #include <linux/hugetlb.h>
30 #include <linux/shm.h>
31 #include <linux/init.h>
32 #include <linux/file.h>
33 #include <linux/mman.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/security.h>
36 #include <linux/syscalls.h>
37 #include <linux/audit.h>
38 #include <linux/capability.h>
39 #include <linux/ptrace.h>
40 #include <linux/seq_file.h>
41 #include <linux/rwsem.h>
42 #include <linux/nsproxy.h>
43 #include <linux/mount.h>
44 #include <linux/ipc_namespace.h>
45
46 #include <linux/uaccess.h>
47
48 #include "util.h"
49
/*
 * Per-open bookkeeping stored in the wrapper file's ->private_data
 * (reached through the shm_file_data() accessor).
 */
struct shm_file_data {
	/* segment id, handed to shm_lock() by shm_open()/shm_close() */
	int id;
	/* namespace of the segment; shm_release() drops it with put_ipc_ns() */
	struct ipc_namespace *ns;
	/* backing file whose f_op the shm_* file operations forward to */
	struct file *file;
	/* vm_ops of the backing mapping, saved by shm_mmap() */
	const struct vm_operations_struct *vm_ops;
};
56
57 #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
58
59 static const struct file_operations shm_file_operations;
60 static const struct vm_operations_struct shm_vm_ops;
61
62 #define shm_ids(ns)     ((ns)->ids[IPC_SHM_IDS])
63
64 #define shm_unlock(shp)                 \
65         ipc_unlock(&(shp)->shm_perm)
66
67 static int newseg(struct ipc_namespace *, struct ipc_params *);
68 static void shm_open(struct vm_area_struct *vma);
69 static void shm_close(struct vm_area_struct *vma);
70 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
71 #ifdef CONFIG_PROC_FS
72 static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
73 #endif
74
75 void shm_init_ns(struct ipc_namespace *ns)
76 {
77         ns->shm_ctlmax = SHMMAX;
78         ns->shm_ctlall = SHMALL;
79         ns->shm_ctlmni = SHMMNI;
80         ns->shm_rmid_forced = 0;
81         ns->shm_tot = 0;
82         ipc_init_ids(&shm_ids(ns));
83 }
84
85 /*
86  * Called with shm_ids.rwsem (writer) and the shp structure locked.
87  * Only shm_ids.rwsem remains locked on exit.
88  */
89 static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
90 {
91         struct shmid_kernel *shp;
92         shp = container_of(ipcp, struct shmid_kernel, shm_perm);
93
94         if (shp->shm_nattch) {
95                 shp->shm_perm.mode |= SHM_DEST;
96                 /* Do not find it any more */
97                 shp->shm_perm.key = IPC_PRIVATE;
98                 shm_unlock(shp);
99         } else
100                 shm_destroy(ns, shp);
101 }
102
#ifdef CONFIG_IPC_NS
/*
 * shm_exit_ns - tear down the shm part of an ipc namespace
 * @ns: namespace being torn down
 *
 * Passes do_shm_rmid() to free_ipcs() for the shm id table, then
 * destroys the table's idr.
 */
void shm_exit_ns(struct ipc_namespace *ns)
{
	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
	idr_destroy(&shm_ids(ns).ipcs_idr);
}
#endif
110
111 static int __init ipc_ns_init(void)
112 {
113         shm_init_ns(&init_ipc_ns);
114         return 0;
115 }
116
117 pure_initcall(ipc_ns_init);
118
119 void __init shm_init(void)
120 {
121         ipc_init_proc_interface("sysvipc/shm",
122 #if BITS_PER_LONG <= 32
123                                 "       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
124 #else
125                                 "       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
126 #endif
127                                 IPC_SHM_IDS, sysvipc_shm_proc_show);
128 }
129
130 static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
131 {
132         struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&shm_ids(ns), id);
133
134         if (IS_ERR(ipcp))
135                 return ERR_CAST(ipcp);
136
137         return container_of(ipcp, struct shmid_kernel, shm_perm);
138 }
139
140 static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
141 {
142         struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id);
143
144         if (IS_ERR(ipcp))
145                 return ERR_CAST(ipcp);
146
147         return container_of(ipcp, struct shmid_kernel, shm_perm);
148 }
149
150 /*
151  * shm_lock_(check_) routines are called in the paths where the rwsem
152  * is not necessarily held.
153  */
154 static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
155 {
156         struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
157
158         /*
159          * We raced in the idr lookup or with shm_destroy().  Either way, the
160          * ID is busted.
161          */
162         WARN_ON(IS_ERR(ipcp));
163
164         return container_of(ipcp, struct shmid_kernel, shm_perm);
165 }
166
167 static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
168 {
169         rcu_read_lock();
170         ipc_lock_object(&ipcp->shm_perm);
171 }
172
173 static void shm_rcu_free(struct rcu_head *head)
174 {
175         struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
176         struct shmid_kernel *shp = ipc_rcu_to_struct(p);
177
178         security_shm_free(shp);
179         ipc_rcu_free(head);
180 }
181
182 static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
183 {
184         list_del(&s->shm_clist);
185         ipc_rmid(&shm_ids(ns), &s->shm_perm);
186 }
187
188
189 /* This is called by fork, once for every shm attach. */
190 static void shm_open(struct vm_area_struct *vma)
191 {
192         struct file *file = vma->vm_file;
193         struct shm_file_data *sfd = shm_file_data(file);
194         struct shmid_kernel *shp;
195
196         shp = shm_lock(sfd->ns, sfd->id);
197         shp->shm_atim = get_seconds();
198         shp->shm_lprid = task_tgid_vnr(current);
199         shp->shm_nattch++;
200         shm_unlock(shp);
201 }
202
203 /*
204  * shm_destroy - free the struct shmid_kernel
205  *
206  * @ns: namespace
207  * @shp: struct to free
208  *
209  * It has to be called with shp and shm_ids.rwsem (writer) locked,
210  * but returns with shp unlocked and freed.
211  */
212 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
213 {
214         struct file *shm_file;
215
216         shm_file = shp->shm_file;
217         shp->shm_file = NULL;
218         ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
219         shm_rmid(ns, shp);
220         shm_unlock(shp);
221         if (!is_file_hugepages(shm_file))
222                 shmem_lock(shm_file, 0, shp->mlock_user);
223         else if (shp->mlock_user)
224                 user_shm_unlock(i_size_read(file_inode(shm_file)),
225                                 shp->mlock_user);
226         fput(shm_file);
227         ipc_rcu_putref(shp, shm_rcu_free);
228 }
229
230 /*
231  * shm_may_destroy - identifies whether shm segment should be destroyed now
232  *
233  * Returns true if and only if there are no active users of the segment and
234  * one of the following is true:
235  *
236  * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
237  *
238  * 2) sysctl kernel.shm_rmid_forced is set to 1.
239  */
240 static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
241 {
242         return (shp->shm_nattch == 0) &&
243                (ns->shm_rmid_forced ||
244                 (shp->shm_perm.mode & SHM_DEST));
245 }
246
247 /*
248  * remove the attach descriptor vma.
249  * free memory for segment if it is marked destroyed.
250  * The descriptor has already been removed from the current->mm->mmap list
251  * and will later be kfree()d.
252  */
253 static void shm_close(struct vm_area_struct *vma)
254 {
255         struct file *file = vma->vm_file;
256         struct shm_file_data *sfd = shm_file_data(file);
257         struct shmid_kernel *shp;
258         struct ipc_namespace *ns = sfd->ns;
259
260         down_write(&shm_ids(ns).rwsem);
261         /* remove from the list of attaches of the shm segment */
262         shp = shm_lock(ns, sfd->id);
263         shp->shm_lprid = task_tgid_vnr(current);
264         shp->shm_dtim = get_seconds();
265         shp->shm_nattch--;
266         if (shm_may_destroy(ns, shp))
267                 shm_destroy(ns, shp);
268         else
269                 shm_unlock(shp);
270         up_write(&shm_ids(ns).rwsem);
271 }
272
273 /* Called with ns->shm_ids(ns).rwsem locked */
274 static int shm_try_destroy_orphaned(int id, void *p, void *data)
275 {
276         struct ipc_namespace *ns = data;
277         struct kern_ipc_perm *ipcp = p;
278         struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
279
280         /*
281          * We want to destroy segments without users and with already
282          * exit'ed originating process.
283          *
284          * As shp->* are changed under rwsem, it's safe to skip shp locking.
285          */
286         if (shp->shm_creator != NULL)
287                 return 0;
288
289         if (shm_may_destroy(ns, shp)) {
290                 shm_lock_by_ptr(shp);
291                 shm_destroy(ns, shp);
292         }
293         return 0;
294 }
295
296 void shm_destroy_orphaned(struct ipc_namespace *ns)
297 {
298         down_write(&shm_ids(ns).rwsem);
299         if (shm_ids(ns).in_use)
300                 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
301         up_write(&shm_ids(ns).rwsem);
302 }
303
304 /* Locking assumes this will only be called with task == current */
305 void exit_shm(struct task_struct *task)
306 {
307         struct ipc_namespace *ns = task->nsproxy->ipc_ns;
308         struct shmid_kernel *shp, *n;
309
310         if (list_empty(&task->sysvshm.shm_clist))
311                 return;
312
313         /*
314          * If kernel.shm_rmid_forced is not set then only keep track of
315          * which shmids are orphaned, so that a later set of the sysctl
316          * can clean them up.
317          */
318         if (!ns->shm_rmid_forced) {
319                 down_read(&shm_ids(ns).rwsem);
320                 list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
321                         shp->shm_creator = NULL;
322                 /*
323                  * Only under read lock but we are only called on current
324                  * so no entry on the list will be shared.
325                  */
326                 list_del(&task->sysvshm.shm_clist);
327                 up_read(&shm_ids(ns).rwsem);
328                 return;
329         }
330
331         /*
332          * Destroy all already created segments, that were not yet mapped,
333          * and mark any mapped as orphan to cover the sysctl toggling.
334          * Destroy is skipped if shm_may_destroy() returns false.
335          */
336         down_write(&shm_ids(ns).rwsem);
337         list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
338                 shp->shm_creator = NULL;
339
340                 if (shm_may_destroy(ns, shp)) {
341                         shm_lock_by_ptr(shp);
342                         shm_destroy(ns, shp);
343                 }
344         }
345
346         /* Remove the list head from any segments still attached. */
347         list_del(&task->sysvshm.shm_clist);
348         up_write(&shm_ids(ns).rwsem);
349 }
350
351 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
352 {
353         struct file *file = vma->vm_file;
354         struct shm_file_data *sfd = shm_file_data(file);
355
356         return sfd->vm_ops->fault(vma, vmf);
357 }
358
359 #ifdef CONFIG_NUMA
360 static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
361 {
362         struct file *file = vma->vm_file;
363         struct shm_file_data *sfd = shm_file_data(file);
364         int err = 0;
365         if (sfd->vm_ops->set_policy)
366                 err = sfd->vm_ops->set_policy(vma, new);
367         return err;
368 }
369
370 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
371                                         unsigned long addr)
372 {
373         struct file *file = vma->vm_file;
374         struct shm_file_data *sfd = shm_file_data(file);
375         struct mempolicy *pol = NULL;
376
377         if (sfd->vm_ops->get_policy)
378                 pol = sfd->vm_ops->get_policy(vma, addr);
379         else if (vma->vm_policy)
380                 pol = vma->vm_policy;
381
382         return pol;
383 }
384 #endif
385
386 static int shm_mmap(struct file *file, struct vm_area_struct *vma)
387 {
388         struct shm_file_data *sfd = shm_file_data(file);
389         int ret;
390
391         ret = sfd->file->f_op->mmap(sfd->file, vma);
392         if (ret != 0)
393                 return ret;
394         sfd->vm_ops = vma->vm_ops;
395 #ifdef CONFIG_MMU
396         WARN_ON(!sfd->vm_ops->fault);
397 #endif
398         vma->vm_ops = &shm_vm_ops;
399         shm_open(vma);
400
401         return ret;
402 }
403
404 static int shm_release(struct inode *ino, struct file *file)
405 {
406         struct shm_file_data *sfd = shm_file_data(file);
407
408         put_ipc_ns(sfd->ns);
409         shm_file_data(file) = NULL;
410         kfree(sfd);
411         return 0;
412 }
413
414 static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
415 {
416         struct shm_file_data *sfd = shm_file_data(file);
417
418         if (!sfd->file->f_op->fsync)
419                 return -EINVAL;
420         return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
421 }
422
423 static long shm_fallocate(struct file *file, int mode, loff_t offset,
424                           loff_t len)
425 {
426         struct shm_file_data *sfd = shm_file_data(file);
427
428         if (!sfd->file->f_op->fallocate)
429                 return -EOPNOTSUPP;
430         return sfd->file->f_op->fallocate(file, mode, offset, len);
431 }
432
433 static unsigned long shm_get_unmapped_area(struct file *file,
434         unsigned long addr, unsigned long len, unsigned long pgoff,
435         unsigned long flags)
436 {
437         struct shm_file_data *sfd = shm_file_data(file);
438         return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
439                                                 pgoff, flags);
440 }
441
442 static const struct file_operations shm_file_operations = {
443         .mmap           = shm_mmap,
444         .fsync          = shm_fsync,
445         .release        = shm_release,
446 #ifndef CONFIG_MMU
447         .get_unmapped_area      = shm_get_unmapped_area,
448 #endif
449         .llseek         = noop_llseek,
450         .fallocate      = shm_fallocate,
451 };
452
453 static const struct file_operations shm_file_operations_huge = {
454         .mmap           = shm_mmap,
455         .fsync          = shm_fsync,
456         .release        = shm_release,
457         .get_unmapped_area      = shm_get_unmapped_area,
458         .llseek         = noop_llseek,
459         .fallocate      = shm_fallocate,
460 };
461
462 int is_file_shm_hugepages(struct file *file)
463 {
464         return file->f_op == &shm_file_operations_huge;
465 }
466
467 static const struct vm_operations_struct shm_vm_ops = {
468         .open   = shm_open,     /* callback for a new vm-area open */
469         .close  = shm_close,    /* callback for when the vm-area is released */
470         .fault  = shm_fault,
471 #if defined(CONFIG_NUMA)
472         .set_policy = shm_set_policy,
473         .get_policy = shm_get_policy,
474 #endif
475 };
476
477 /**
478  * newseg - Create a new shared memory segment
479  * @ns: namespace
480  * @params: ptr to the structure that contains key, size and shmflg
481  *
482  * Called with shm_ids.rwsem held as a writer.
483  */
484 static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
485 {
486         key_t key = params->key;
487         int shmflg = params->flg;
488         size_t size = params->u.size;
489         int error;
490         struct shmid_kernel *shp;
491         size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
492         struct file *file;
493         char name[13];
494         int id;
495         vm_flags_t acctflag = 0;
496
497         if (size < SHMMIN || size > ns->shm_ctlmax)
498                 return -EINVAL;
499
500         if (numpages << PAGE_SHIFT < size)
501                 return -ENOSPC;
502
503         if (ns->shm_tot + numpages < ns->shm_tot ||
504                         ns->shm_tot + numpages > ns->shm_ctlall)
505                 return -ENOSPC;
506
507         shp = ipc_rcu_alloc(sizeof(*shp));
508         if (!shp)
509                 return -ENOMEM;
510
511         shp->shm_perm.key = key;
512         shp->shm_perm.mode = (shmflg & S_IRWXUGO);
513         shp->mlock_user = NULL;
514
515         shp->shm_perm.security = NULL;
516         error = security_shm_alloc(shp);
517         if (error) {
518                 ipc_rcu_putref(shp, ipc_rcu_free);
519                 return error;
520         }
521
522         sprintf(name, "SYSV%08x", key);
523         if (shmflg & SHM_HUGETLB) {
524                 struct hstate *hs;
525                 size_t hugesize;
526
527                 hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
528                 if (!hs) {
529                         error = -EINVAL;
530                         goto no_file;
531                 }
532                 hugesize = ALIGN(size, huge_page_size(hs));
533
534                 /* hugetlb_file_setup applies strict accounting */
535                 if (shmflg & SHM_NORESERVE)
536                         acctflag = VM_NORESERVE;
537                 file = hugetlb_file_setup(name, hugesize, acctflag,
538                                   &shp->mlock_user, HUGETLB_SHMFS_INODE,
539                                 (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
540         } else {
541                 /*
542                  * Do not allow no accounting for OVERCOMMIT_NEVER, even
543                  * if it's asked for.
544                  */
545                 if  ((shmflg & SHM_NORESERVE) &&
546                                 sysctl_overcommit_memory != OVERCOMMIT_NEVER)
547                         acctflag = VM_NORESERVE;
548                 file = shmem_kernel_file_setup(name, size, acctflag);
549         }
550         error = PTR_ERR(file);
551         if (IS_ERR(file))
552                 goto no_file;
553
554         shp->shm_cprid = task_tgid_vnr(current);
555         shp->shm_lprid = 0;
556         shp->shm_atim = shp->shm_dtim = 0;
557         shp->shm_ctim = get_seconds();
558         shp->shm_segsz = size;
559         shp->shm_nattch = 0;
560         shp->shm_file = file;
561         shp->shm_creator = current;
562
563         id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
564         if (id < 0) {
565                 error = id;
566                 goto no_id;
567         }
568
569         list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
570
571         /*
572          * shmid gets reported as "inode#" in /proc/pid/maps.
573          * proc-ps tools use this. Changing this will break them.
574          */
575         file_inode(file)->i_ino = shp->shm_perm.id;
576
577         ns->shm_tot += numpages;
578         error = shp->shm_perm.id;
579
580         ipc_unlock_object(&shp->shm_perm);
581         rcu_read_unlock();
582         return error;
583
584 no_id:
585         if (is_file_hugepages(file) && shp->mlock_user)
586                 user_shm_unlock(size, shp->mlock_user);
587         fput(file);
588 no_file:
589         ipc_rcu_putref(shp, shm_rcu_free);
590         return error;
591 }
592
593 /*
594  * Called with shm_ids.rwsem and ipcp locked.
595  */
596 static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
597 {
598         struct shmid_kernel *shp;
599
600         shp = container_of(ipcp, struct shmid_kernel, shm_perm);
601         return security_shm_associate(shp, shmflg);
602 }
603
604 /*
605  * Called with shm_ids.rwsem and ipcp locked.
606  */
607 static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
608                                 struct ipc_params *params)
609 {
610         struct shmid_kernel *shp;
611
612         shp = container_of(ipcp, struct shmid_kernel, shm_perm);
613         if (shp->shm_segsz < params->u.size)
614                 return -EINVAL;
615
616         return 0;
617 }
618
619 SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
620 {
621         struct ipc_namespace *ns;
622         static const struct ipc_ops shm_ops = {
623                 .getnew = newseg,
624                 .associate = shm_security,
625                 .more_checks = shm_more_checks,
626         };
627         struct ipc_params shm_params;
628
629         ns = current->nsproxy->ipc_ns;
630
631         shm_params.key = key;
632         shm_params.flg = shmflg;
633         shm_params.u.size = size;
634
635         return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
636 }
637
638 static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
639 {
640         switch (version) {
641         case IPC_64:
642                 return copy_to_user(buf, in, sizeof(*in));
643         case IPC_OLD:
644             {
645                 struct shmid_ds out;
646
647                 memset(&out, 0, sizeof(out));
648                 ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
649                 out.shm_segsz   = in->shm_segsz;
650                 out.shm_atime   = in->shm_atime;
651                 out.shm_dtime   = in->shm_dtime;
652                 out.shm_ctime   = in->shm_ctime;
653                 out.shm_cpid    = in->shm_cpid;
654                 out.shm_lpid    = in->shm_lpid;
655                 out.shm_nattch  = in->shm_nattch;
656
657                 return copy_to_user(buf, &out, sizeof(out));
658             }
659         default:
660                 return -EINVAL;
661         }
662 }
663
664 static inline unsigned long
665 copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
666 {
667         switch (version) {
668         case IPC_64:
669                 if (copy_from_user(out, buf, sizeof(*out)))
670                         return -EFAULT;
671                 return 0;
672         case IPC_OLD:
673             {
674                 struct shmid_ds tbuf_old;
675
676                 if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
677                         return -EFAULT;
678
679                 out->shm_perm.uid       = tbuf_old.shm_perm.uid;
680                 out->shm_perm.gid       = tbuf_old.shm_perm.gid;
681                 out->shm_perm.mode      = tbuf_old.shm_perm.mode;
682
683                 return 0;
684             }
685         default:
686                 return -EINVAL;
687         }
688 }
689
690 static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
691 {
692         switch (version) {
693         case IPC_64:
694                 return copy_to_user(buf, in, sizeof(*in));
695         case IPC_OLD:
696             {
697                 struct shminfo out;
698
699                 if (in->shmmax > INT_MAX)
700                         out.shmmax = INT_MAX;
701                 else
702                         out.shmmax = (int)in->shmmax;
703
704                 out.shmmin      = in->shmmin;
705                 out.shmmni      = in->shmmni;
706                 out.shmseg      = in->shmseg;
707                 out.shmall      = in->shmall;
708
709                 return copy_to_user(buf, &out, sizeof(out));
710             }
711         default:
712                 return -EINVAL;
713         }
714 }
715
716 /*
717  * Calculate and add used RSS and swap pages of a shm.
718  * Called with shm_ids.rwsem held as a reader
719  */
720 static void shm_add_rss_swap(struct shmid_kernel *shp,
721         unsigned long *rss_add, unsigned long *swp_add)
722 {
723         struct inode *inode;
724
725         inode = file_inode(shp->shm_file);
726
727         if (is_file_hugepages(shp->shm_file)) {
728                 struct address_space *mapping = inode->i_mapping;
729                 struct hstate *h = hstate_file(shp->shm_file);
730                 *rss_add += pages_per_huge_page(h) * mapping->nrpages;
731         } else {
732 #ifdef CONFIG_SHMEM
733                 struct shmem_inode_info *info = SHMEM_I(inode);
734                 spin_lock(&info->lock);
735                 *rss_add += inode->i_mapping->nrpages;
736                 *swp_add += info->swapped;
737                 spin_unlock(&info->lock);
738 #else
739                 *rss_add += inode->i_mapping->nrpages;
740 #endif
741         }
742 }
743
744 /*
745  * Called with shm_ids.rwsem held as a reader
746  */
747 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
748                 unsigned long *swp)
749 {
750         int next_id;
751         int total, in_use;
752
753         *rss = 0;
754         *swp = 0;
755
756         in_use = shm_ids(ns).in_use;
757
758         for (total = 0, next_id = 0; total < in_use; next_id++) {
759                 struct kern_ipc_perm *ipc;
760                 struct shmid_kernel *shp;
761
762                 ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
763                 if (ipc == NULL)
764                         continue;
765                 shp = container_of(ipc, struct shmid_kernel, shm_perm);
766
767                 shm_add_rss_swap(shp, rss, swp);
768
769                 total++;
770         }
771 }
772
773 /*
774  * This function handles some shmctl commands which require the rwsem
775  * to be held in write mode.
776  * NOTE: no locks must be held, the rwsem is taken inside this function.
777  */
778 static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
779                        struct shmid_ds __user *buf, int version)
780 {
781         struct kern_ipc_perm *ipcp;
782         struct shmid64_ds shmid64;
783         struct shmid_kernel *shp;
784         int err;
785
786         if (cmd == IPC_SET) {
787                 if (copy_shmid_from_user(&shmid64, buf, version))
788                         return -EFAULT;
789         }
790
791         down_write(&shm_ids(ns).rwsem);
792         rcu_read_lock();
793
794         ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
795                                       &shmid64.shm_perm, 0);
796         if (IS_ERR(ipcp)) {
797                 err = PTR_ERR(ipcp);
798                 goto out_unlock1;
799         }
800
801         shp = container_of(ipcp, struct shmid_kernel, shm_perm);
802
803         err = security_shm_shmctl(shp, cmd);
804         if (err)
805                 goto out_unlock1;
806
807         switch (cmd) {
808         case IPC_RMID:
809                 ipc_lock_object(&shp->shm_perm);
810                 /* do_shm_rmid unlocks the ipc object and rcu */
811                 do_shm_rmid(ns, ipcp);
812                 goto out_up;
813         case IPC_SET:
814                 ipc_lock_object(&shp->shm_perm);
815                 err = ipc_update_perm(&shmid64.shm_perm, ipcp);
816                 if (err)
817                         goto out_unlock0;
818                 shp->shm_ctim = get_seconds();
819                 break;
820         default:
821                 err = -EINVAL;
822                 goto out_unlock1;
823         }
824
825 out_unlock0:
826         ipc_unlock_object(&shp->shm_perm);
827 out_unlock1:
828         rcu_read_unlock();
829 out_up:
830         up_write(&shm_ids(ns).rwsem);
831         return err;
832 }
833
834 static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
835                          int cmd, int version, void __user *buf)
836 {
837         int err;
838         struct shmid_kernel *shp;
839
840         /* preliminary security checks for *_INFO */
841         if (cmd == IPC_INFO || cmd == SHM_INFO) {
842                 err = security_shm_shmctl(NULL, cmd);
843                 if (err)
844                         return err;
845         }
846
847         switch (cmd) {
848         case IPC_INFO:
849         {
850                 struct shminfo64 shminfo;
851
852                 memset(&shminfo, 0, sizeof(shminfo));
853                 shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
854                 shminfo.shmmax = ns->shm_ctlmax;
855                 shminfo.shmall = ns->shm_ctlall;
856
857                 shminfo.shmmin = SHMMIN;
858                 if (copy_shminfo_to_user(buf, &shminfo, version))
859                         return -EFAULT;
860
861                 down_read(&shm_ids(ns).rwsem);
862                 err = ipc_get_maxid(&shm_ids(ns));
863                 up_read(&shm_ids(ns).rwsem);
864
865                 if (err < 0)
866                         err = 0;
867                 goto out;
868         }
869         case SHM_INFO:
870         {
871                 struct shm_info shm_info;
872
873                 memset(&shm_info, 0, sizeof(shm_info));
874                 down_read(&shm_ids(ns).rwsem);
875                 shm_info.used_ids = shm_ids(ns).in_use;
876                 shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
877                 shm_info.shm_tot = ns->shm_tot;
878                 shm_info.swap_attempts = 0;
879                 shm_info.swap_successes = 0;
880                 err = ipc_get_maxid(&shm_ids(ns));
881                 up_read(&shm_ids(ns).rwsem);
882                 if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
883                         err = -EFAULT;
884                         goto out;
885                 }
886
887                 err = err < 0 ? 0 : err;
888                 goto out;
889         }
890         case SHM_STAT:
891         case IPC_STAT:
892         {
893                 struct shmid64_ds tbuf;
894                 int result;
895
896                 rcu_read_lock();
897                 if (cmd == SHM_STAT) {
898                         shp = shm_obtain_object(ns, shmid);
899                         if (IS_ERR(shp)) {
900                                 err = PTR_ERR(shp);
901                                 goto out_unlock;
902                         }
903                         result = shp->shm_perm.id;
904                 } else {
905                         shp = shm_obtain_object_check(ns, shmid);
906                         if (IS_ERR(shp)) {
907                                 err = PTR_ERR(shp);
908                                 goto out_unlock;
909                         }
910                         result = 0;
911                 }
912
913                 err = -EACCES;
914                 if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
915                         goto out_unlock;
916
917                 err = security_shm_shmctl(shp, cmd);
918                 if (err)
919                         goto out_unlock;
920
921                 memset(&tbuf, 0, sizeof(tbuf));
922                 kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
923                 tbuf.shm_segsz  = shp->shm_segsz;
924                 tbuf.shm_atime  = shp->shm_atim;
925                 tbuf.shm_dtime  = shp->shm_dtim;
926                 tbuf.shm_ctime  = shp->shm_ctim;
927                 tbuf.shm_cpid   = shp->shm_cprid;
928                 tbuf.shm_lpid   = shp->shm_lprid;
929                 tbuf.shm_nattch = shp->shm_nattch;
930                 rcu_read_unlock();
931
932                 if (copy_shmid_to_user(buf, &tbuf, version))
933                         err = -EFAULT;
934                 else
935                         err = result;
936                 goto out;
937         }
938         default:
939                 return -EINVAL;
940         }
941
942 out_unlock:
943         rcu_read_unlock();
944 out:
945         return err;
946 }
947
948 SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
949 {
950         struct shmid_kernel *shp;
951         int err, version;
952         struct ipc_namespace *ns;
953
954         if (cmd < 0 || shmid < 0)
955                 return -EINVAL;
956
957         version = ipc_parse_version(&cmd);
958         ns = current->nsproxy->ipc_ns;
959
960         switch (cmd) {
961         case IPC_INFO:
962         case SHM_INFO:
963         case SHM_STAT:
964         case IPC_STAT:
965                 return shmctl_nolock(ns, shmid, cmd, version, buf);
966         case IPC_RMID:
967         case IPC_SET:
968                 return shmctl_down(ns, shmid, cmd, buf, version);
969         case SHM_LOCK:
970         case SHM_UNLOCK:
971         {
972                 struct file *shm_file;
973
974                 rcu_read_lock();
975                 shp = shm_obtain_object_check(ns, shmid);
976                 if (IS_ERR(shp)) {
977                         err = PTR_ERR(shp);
978                         goto out_unlock1;
979                 }
980
981                 audit_ipc_obj(&(shp->shm_perm));
982                 err = security_shm_shmctl(shp, cmd);
983                 if (err)
984                         goto out_unlock1;
985
986                 ipc_lock_object(&shp->shm_perm);
987
988                 /* check if shm_destroy() is tearing down shp */
989                 if (!ipc_valid_object(&shp->shm_perm)) {
990                         err = -EIDRM;
991                         goto out_unlock0;
992                 }
993
994                 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
995                         kuid_t euid = current_euid();
996                         if (!uid_eq(euid, shp->shm_perm.uid) &&
997                             !uid_eq(euid, shp->shm_perm.cuid)) {
998                                 err = -EPERM;
999                                 goto out_unlock0;
1000                         }
1001                         if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
1002                                 err = -EPERM;
1003                                 goto out_unlock0;
1004                         }
1005                 }
1006
1007                 shm_file = shp->shm_file;
1008                 if (is_file_hugepages(shm_file))
1009                         goto out_unlock0;
1010
1011                 if (cmd == SHM_LOCK) {
1012                         struct user_struct *user = current_user();
1013                         err = shmem_lock(shm_file, 1, user);
1014                         if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
1015                                 shp->shm_perm.mode |= SHM_LOCKED;
1016                                 shp->mlock_user = user;
1017                         }
1018                         goto out_unlock0;
1019                 }
1020
1021                 /* SHM_UNLOCK */
1022                 if (!(shp->shm_perm.mode & SHM_LOCKED))
1023                         goto out_unlock0;
1024                 shmem_lock(shm_file, 0, shp->mlock_user);
1025                 shp->shm_perm.mode &= ~SHM_LOCKED;
1026                 shp->mlock_user = NULL;
1027                 get_file(shm_file);
1028                 ipc_unlock_object(&shp->shm_perm);
1029                 rcu_read_unlock();
1030                 shmem_unlock_mapping(shm_file->f_mapping);
1031
1032                 fput(shm_file);
1033                 return err;
1034         }
1035         default:
1036                 return -EINVAL;
1037         }
1038
1039 out_unlock0:
1040         ipc_unlock_object(&shp->shm_perm);
1041 out_unlock1:
1042         rcu_read_unlock();
1043         return err;
1044 }
1045
1046 /*
1047  * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
1048  *
1049  * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
1050  * "raddr" thing points to kernel space, and there has to be a wrapper around
1051  * this.
1052  */
1053 long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
1054               unsigned long shmlba)
1055 {
1056         struct shmid_kernel *shp;
1057         unsigned long addr;
1058         unsigned long size;
1059         struct file *file;
1060         int    err;
1061         unsigned long flags;
1062         unsigned long prot;
1063         int acc_mode;
1064         struct ipc_namespace *ns;
1065         struct shm_file_data *sfd;
1066         struct path path;
1067         fmode_t f_mode;
1068         unsigned long populate = 0;
1069
1070         err = -EINVAL;
1071         if (shmid < 0)
1072                 goto out;
1073         else if ((addr = (ulong)shmaddr)) {
1074                 if (addr & (shmlba - 1)) {
1075                         if (shmflg & SHM_RND)
1076                                 addr &= ~(shmlba - 1);     /* round down */
1077                         else
1078 #ifndef __ARCH_FORCE_SHMLBA
1079                                 if (addr & ~PAGE_MASK)
1080 #endif
1081                                         goto out;
1082                 }
1083                 flags = MAP_SHARED | MAP_FIXED;
1084         } else {
1085                 if ((shmflg & SHM_REMAP))
1086                         goto out;
1087
1088                 flags = MAP_SHARED;
1089         }
1090
1091         if (shmflg & SHM_RDONLY) {
1092                 prot = PROT_READ;
1093                 acc_mode = S_IRUGO;
1094                 f_mode = FMODE_READ;
1095         } else {
1096                 prot = PROT_READ | PROT_WRITE;
1097                 acc_mode = S_IRUGO | S_IWUGO;
1098                 f_mode = FMODE_READ | FMODE_WRITE;
1099         }
1100         if (shmflg & SHM_EXEC) {
1101                 prot |= PROT_EXEC;
1102                 acc_mode |= S_IXUGO;
1103         }
1104
1105         /*
1106          * We cannot rely on the fs check since SYSV IPC does have an
1107          * additional creator id...
1108          */
1109         ns = current->nsproxy->ipc_ns;
1110         rcu_read_lock();
1111         shp = shm_obtain_object_check(ns, shmid);
1112         if (IS_ERR(shp)) {
1113                 err = PTR_ERR(shp);
1114                 goto out_unlock;
1115         }
1116
1117         err = -EACCES;
1118         if (ipcperms(ns, &shp->shm_perm, acc_mode))
1119                 goto out_unlock;
1120
1121         err = security_shm_shmat(shp, shmaddr, shmflg);
1122         if (err)
1123                 goto out_unlock;
1124
1125         ipc_lock_object(&shp->shm_perm);
1126
1127         /* check if shm_destroy() is tearing down shp */
1128         if (!ipc_valid_object(&shp->shm_perm)) {
1129                 ipc_unlock_object(&shp->shm_perm);
1130                 err = -EIDRM;
1131                 goto out_unlock;
1132         }
1133
1134         path = shp->shm_file->f_path;
1135         path_get(&path);
1136         shp->shm_nattch++;
1137         size = i_size_read(d_inode(path.dentry));
1138         ipc_unlock_object(&shp->shm_perm);
1139         rcu_read_unlock();
1140
1141         err = -ENOMEM;
1142         sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
1143         if (!sfd) {
1144                 path_put(&path);
1145                 goto out_nattch;
1146         }
1147
1148         file = alloc_file(&path, f_mode,
1149                           is_file_hugepages(shp->shm_file) ?
1150                                 &shm_file_operations_huge :
1151                                 &shm_file_operations);
1152         err = PTR_ERR(file);
1153         if (IS_ERR(file)) {
1154                 kfree(sfd);
1155                 path_put(&path);
1156                 goto out_nattch;
1157         }
1158
1159         file->private_data = sfd;
1160         file->f_mapping = shp->shm_file->f_mapping;
1161         sfd->id = shp->shm_perm.id;
1162         sfd->ns = get_ipc_ns(ns);
1163         sfd->file = shp->shm_file;
1164         sfd->vm_ops = NULL;
1165
1166         err = security_mmap_file(file, prot, flags);
1167         if (err)
1168                 goto out_fput;
1169
1170         down_write(&current->mm->mmap_sem);
1171         if (addr && !(shmflg & SHM_REMAP)) {
1172                 err = -EINVAL;
1173                 if (addr + size < addr)
1174                         goto invalid;
1175
1176                 if (find_vma_intersection(current->mm, addr, addr + size))
1177                         goto invalid;
1178         }
1179
1180         addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
1181         *raddr = addr;
1182         err = 0;
1183         if (IS_ERR_VALUE(addr))
1184                 err = (long)addr;
1185 invalid:
1186         up_write(&current->mm->mmap_sem);
1187         if (populate)
1188                 mm_populate(addr, populate);
1189
1190 out_fput:
1191         fput(file);
1192
1193 out_nattch:
1194         down_write(&shm_ids(ns).rwsem);
1195         shp = shm_lock(ns, shmid);
1196         shp->shm_nattch--;
1197         if (shm_may_destroy(ns, shp))
1198                 shm_destroy(ns, shp);
1199         else
1200                 shm_unlock(shp);
1201         up_write(&shm_ids(ns).rwsem);
1202         return err;
1203
1204 out_unlock:
1205         rcu_read_unlock();
1206 out:
1207         return err;
1208 }
1209
1210 SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
1211 {
1212         unsigned long ret;
1213         long err;
1214
1215         err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
1216         if (err)
1217                 return err;
1218         force_successful_syscall_return();
1219         return (long)ret;
1220 }
1221
1222 /*
1223  * detach and kill segment if marked destroyed.
1224  * The work is done in shm_close.
1225  */
1226 SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1227 {
1228         struct mm_struct *mm = current->mm;
1229         struct vm_area_struct *vma;
1230         unsigned long addr = (unsigned long)shmaddr;
1231         int retval = -EINVAL;
1232 #ifdef CONFIG_MMU
1233         loff_t size = 0;
1234         struct file *file;
1235         struct vm_area_struct *next;
1236 #endif
1237
1238         if (addr & ~PAGE_MASK)
1239                 return retval;
1240
1241         down_write(&mm->mmap_sem);
1242
1243         /*
1244          * This function tries to be smart and unmap shm segments that
1245          * were modified by partial mlock or munmap calls:
1246          * - It first determines the size of the shm segment that should be
1247          *   unmapped: It searches for a vma that is backed by shm and that
1248          *   started at address shmaddr. It records it's size and then unmaps
1249          *   it.
1250          * - Then it unmaps all shm vmas that started at shmaddr and that
1251          *   are within the initially determined size and that are from the
1252          *   same shm segment from which we determined the size.
1253          * Errors from do_munmap are ignored: the function only fails if
1254          * it's called with invalid parameters or if it's called to unmap
1255          * a part of a vma. Both calls in this function are for full vmas,
1256          * the parameters are directly copied from the vma itself and always
1257          * valid - therefore do_munmap cannot fail. (famous last words?)
1258          */
1259         /*
1260          * If it had been mremap()'d, the starting address would not
1261          * match the usual checks anyway. So assume all vma's are
1262          * above the starting address given.
1263          */
1264         vma = find_vma(mm, addr);
1265
1266 #ifdef CONFIG_MMU
1267         while (vma) {
1268                 next = vma->vm_next;
1269
1270                 /*
1271                  * Check if the starting address would match, i.e. it's
1272                  * a fragment created by mprotect() and/or munmap(), or it
1273                  * otherwise it starts at this address with no hassles.
1274                  */
1275                 if ((vma->vm_ops == &shm_vm_ops) &&
1276                         (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
1277
1278                         /*
1279                          * Record the file of the shm segment being
1280                          * unmapped.  With mremap(), someone could place
1281                          * page from another segment but with equal offsets
1282                          * in the range we are unmapping.
1283                          */
1284                         file = vma->vm_file;
1285                         size = i_size_read(file_inode(vma->vm_file));
1286                         do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1287                         /*
1288                          * We discovered the size of the shm segment, so
1289                          * break out of here and fall through to the next
1290                          * loop that uses the size information to stop
1291                          * searching for matching vma's.
1292                          */
1293                         retval = 0;
1294                         vma = next;
1295                         break;
1296                 }
1297                 vma = next;
1298         }
1299
1300         /*
1301          * We need look no further than the maximum address a fragment
1302          * could possibly have landed at. Also cast things to loff_t to
1303          * prevent overflows and make comparisons vs. equal-width types.
1304          */
1305         size = PAGE_ALIGN(size);
1306         while (vma && (loff_t)(vma->vm_end - addr) <= size) {
1307                 next = vma->vm_next;
1308
1309                 /* finding a matching vma now does not alter retval */
1310                 if ((vma->vm_ops == &shm_vm_ops) &&
1311                     ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
1312                     (vma->vm_file == file))
1313                         do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1314                 vma = next;
1315         }
1316
1317 #else /* CONFIG_MMU */
1318         /* under NOMMU conditions, the exact address to be destroyed must be
1319          * given */
1320         if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1321                 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1322                 retval = 0;
1323         }
1324
1325 #endif
1326
1327         up_write(&mm->mmap_sem);
1328         return retval;
1329 }
1330
#ifdef CONFIG_PROC_FS

/* Field width of the size columns; wider when long has more than 32 bits. */
#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif

/*
 * seq_file show callback: print one line describing the shm segment
 * passed in @it. User and group ids are translated into the user
 * namespace of the seq_file; rss and swap are printed in bytes.
 * Always returns 0.
 */
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
	struct user_namespace *user_ns = seq_user_ns(s);
	struct shmid_kernel *shp = it;
	unsigned long rss_pages = 0, swp_pages = 0;
	unsigned long rss_bytes, swp_bytes;
	uid_t uid, cuid;
	gid_t gid, cgid;

	shm_add_rss_swap(shp, &rss_pages, &swp_pages);
	rss_bytes = rss_pages * PAGE_SIZE;
	swp_bytes = swp_pages * PAGE_SIZE;

	uid = from_kuid_munged(user_ns, shp->shm_perm.uid);
	gid = from_kgid_munged(user_ns, shp->shm_perm.gid);
	cuid = from_kuid_munged(user_ns, shp->shm_perm.cuid);
	cgid = from_kgid_munged(user_ns, shp->shm_perm.cgid);

	seq_printf(s,
		   "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
		   "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
		   SIZE_SPEC " " SIZE_SPEC "\n",
		   shp->shm_perm.key,
		   shp->shm_perm.id,
		   shp->shm_perm.mode,
		   shp->shm_segsz,
		   shp->shm_cprid,
		   shp->shm_lprid,
		   shp->shm_nattch,
		   uid,
		   gid,
		   cuid,
		   cgid,
		   shp->shm_atim,
		   shp->shm_dtim,
		   shp->shm_ctim,
		   rss_bytes,
		   swp_bytes);

	return 0;
}
#endif