/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>

#include <asm/uaccess.h>

static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

struct hugetlbfs_config {
        kuid_t   uid;
        kgid_t   gid;
        umode_t mode;
        long    nr_blocks;
        long    nr_inodes;
        struct hstate *hstate;
};

struct hugetlbfs_inode_info {
        struct shared_policy policy;
        struct inode vfs_inode;
};

static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
        return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

static struct backing_dev_info hugetlbfs_backing_dev_info = {
        .name           = "hugetlbfs",
        .ra_pages       = 0,    /* No readahead */
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

int sysctl_hugetlb_shm_group;

enum {
        Opt_size, Opt_nr_inodes,
        Opt_mode, Opt_uid, Opt_gid,
        Opt_pagesize,
        Opt_err,
};

static const match_table_t tokens = {
        {Opt_size,      "size=%s"},
        {Opt_nr_inodes, "nr_inodes=%s"},
        {Opt_mode,      "mode=%o"},
        {Opt_uid,       "uid=%u"},
        {Opt_gid,       "gid=%u"},
        {Opt_pagesize,  "pagesize=%s"},
        {Opt_err,       NULL},
};
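
/*
 * Illustrative usage (not from this file; /mnt/huge is an arbitrary
 * example mount point): the token table above corresponds to a mount
 * invocation such as
 *
 *	mount -t hugetlbfs \
 *	      -o size=1G,nr_inodes=64,pagesize=2M,uid=1000,gid=1000,mode=1777 \
 *	      none /mnt/huge
 *
 * hugetlbfs_parse_options() below consumes one comma-separated token
 * at a time via match_token().
 */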

static void huge_pagevec_release(struct pagevec *pvec)
{
        int i;

        for (i = 0; i < pagevec_count(pvec); ++i)
                put_page(pvec->pages[i]);

        pagevec_reinit(pvec);
}

static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(file);
        loff_t len, vma_len;
        int ret;
        struct hstate *h = hstate_file(file);

        /*
         * vma address alignment (but not the pgoff alignment) has
         * already been checked by prepare_hugepage_range.  If you add
         * any error returns here, do so after setting VM_HUGETLB, so
         * that the is_vm_hugetlb_page() tests below unmap_region go
         * the right way when do_mmap_pgoff unwinds (may be important
         * on powerpc and ia64).
         */
        vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
        vma->vm_ops = &hugetlb_vm_ops;

        if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                return -EINVAL;

        vma_len = (loff_t)(vma->vm_end - vma->vm_start);

        mutex_lock(&inode->i_mutex);
        file_accessed(file);

        ret = -ENOMEM;
        len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        if (hugetlb_reserve_pages(inode,
                                vma->vm_pgoff >> huge_page_order(h),
                                len >> huge_page_shift(h), vma,
                                vma->vm_flags))
                goto out;

        ret = 0;
        hugetlb_prefault_arch_hook(vma->vm_mm);
        if (vma->vm_flags & VM_WRITE && inode->i_size < len)
                inode->i_size = len;
out:
        mutex_unlock(&inode->i_mutex);

        return ret;
}
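
/*
 * Illustrative userspace sketch (not part of this file; the path and
 * sizes are made up, assuming a 2 MiB hstate): mapping a file that
 * lives on a hugetlbfs mount.
 *
 *	int fd = open("/mnt/huge/buf", O_CREAT | O_RDWR, 0600);
 *	void *p = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *
 * The hugetlb_reserve_pages() call above runs at mmap() time, so a
 * successful mmap() means later faults on the mapping can be satisfied
 * from the reserved huge pages.
 */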

/*
 * Called under down_write(mmap_sem).
 */

#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct hstate *h = hstate_file(file);
        struct vm_unmapped_area_info info;

        if (len & ~huge_page_mask(h))
                return -EINVAL;
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (flags & MAP_FIXED) {
                if (prepare_hugepage_range(file, addr, len))
                        return -EINVAL;
                return addr;
        }

        if (addr) {
                addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }

        info.flags = 0;
        info.length = len;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = TASK_SIZE;
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        return vm_unmapped_area(&info);
}
#endif
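
/*
 * Worked example for the align_mask above (assuming 4 KiB base pages
 * and a 2 MiB hstate): huge_page_mask(h) == ~0x1fffffUL, so
 * PAGE_MASK & ~huge_page_mask(h) == 0x1ff000, and together with the
 * implicit base-page alignment vm_unmapped_area() hands back an
 * address aligned to a 2 MiB boundary.
 */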

static int
hugetlbfs_read_actor(struct page *page, unsigned long offset,
                        char __user *buf, unsigned long count,
                        unsigned long size)
{
        char *kaddr;
        unsigned long left, copied = 0;
        int i, chunksize;

        if (size > count)
                size = count;

        /* Find which 4k chunk and offset within that chunk */
        i = offset >> PAGE_CACHE_SHIFT;
        offset = offset & ~PAGE_CACHE_MASK;

        while (size) {
                chunksize = PAGE_CACHE_SIZE;
                if (offset)
                        chunksize -= offset;
                if (chunksize > size)
                        chunksize = size;
                kaddr = kmap(&page[i]);
                left = __copy_to_user(buf, kaddr + offset, chunksize);
                kunmap(&page[i]);
                if (left) {
                        copied += (chunksize - left);
                        break;
                }
                offset = 0;
                size -= chunksize;
                buf += chunksize;
                copied += chunksize;
                i++;
        }
        return copied ? copied : -EFAULT;
}

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data.  It's *very* similar to do_generic_mapping_read(), but we can't use
 * that since it has PAGE_CACHE_SIZE assumptions.
 */
static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
                              size_t len, loff_t *ppos)
{
        struct hstate *h = hstate_file(filp);
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        unsigned long index = *ppos >> huge_page_shift(h);
        unsigned long offset = *ppos & ~huge_page_mask(h);
        unsigned long end_index;
        loff_t isize;
        ssize_t retval = 0;

        /* validate length */
        if (len == 0)
                goto out;

        for (;;) {
                struct page *page;
                unsigned long nr, ret;
                int ra;

                /* nr is the maximum number of bytes to copy from this page */
                nr = huge_page_size(h);
                isize = i_size_read(inode);
                if (!isize)
                        goto out;
                end_index = (isize - 1) >> huge_page_shift(h);
                if (index >= end_index) {
                        if (index > end_index)
                                goto out;
                        nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
                        if (nr <= offset)
                                goto out;
                }
                nr = nr - offset;

                /* Find the page */
                page = find_lock_page(mapping, index);
                if (unlikely(page == NULL)) {
                        /*
                         * We have a HOLE, zero out the user-buffer for the
                         * length of the hole or request.
                         */
                        ret = len < nr ? len : nr;
                        if (clear_user(buf, ret))
                                ra = -EFAULT;
                        else
                                ra = 0;
                } else {
                        unlock_page(page);

                        /*
                         * We have the page, copy it to user space buffer.
                         */
                        ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
                        ret = ra;
                        page_cache_release(page);
                }
                if (ra < 0) {
                        if (retval == 0)
                                retval = ra;
                        goto out;
                }

                offset += ret;
                retval += ret;
                len -= ret;
                index += offset >> huge_page_shift(h);
                offset &= ~huge_page_mask(h);

                /* short read or no more work */
                if ((ret != nr) || (len == 0))
                        break;
        }
out:
        *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
        return retval;
}
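
/*
 * Illustrative behaviour (not from this file): a page absent from the
 * page cache is a hole, which the loop above serves via clear_user().
 * With a 2 MiB hstate:
 *
 *	ftruncate(fd, 2 * 1024 * 1024);	  // one huge page, nothing faulted
 *	ssize_t n = read(fd, buf, 4096);  // n == 4096, buf is all zeroes
 */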

static int hugetlbfs_write_begin(struct file *file,
                        struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        return -EINVAL;
}

static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        BUG();
        return -EINVAL;
}
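
/*
 * write(2) is deliberately unsupported: the file_operations below
 * define no ->write, and these address_space stubs back that up --
 * ->write_begin fails with -EINVAL before any page is touched, so
 * ->write_end can never legitimately be reached (hence the BUG()).
 * Data enters a hugetlbfs file only through mmap() faults.
 */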

static void truncate_huge_page(struct page *page)
{
        cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
        ClearPageUptodate(page);
        delete_from_page_cache(page);
}

static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
        struct hstate *h = hstate_inode(inode);
        struct address_space *mapping = &inode->i_data;
        const pgoff_t start = lstart >> huge_page_shift(h);
        struct pagevec pvec;
        pgoff_t next;
        int i, freed = 0;

        pagevec_init(&pvec, 0);
        next = start;
        while (1) {
                if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                        if (next == start)
                                break;
                        next = start;
                        continue;
                }

                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];

                        lock_page(page);
                        if (page->index > next)
                                next = page->index;
                        ++next;
                        truncate_huge_page(page);
                        unlock_page(page);
                        freed++;
                }
                huge_pagevec_release(&pvec);
        }
        BUG_ON(!lstart && mapping->nrpages);
        hugetlb_unreserve_pages(inode, start, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
        struct resv_map *resv_map;

        truncate_hugepages(inode, 0);
        resv_map = (struct resv_map *)inode->i_mapping->private_data;
        /* root inode doesn't have the resv_map, so we should check it */
        if (resv_map)
                resv_map_release(&resv_map->refs);
        clear_inode(inode);
}

static inline void
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
                unsigned long v_offset;

                /*
                 * Can the expression below overflow on 32-bit arches?
                 * No, because the interval tree returns us only those vmas
                 * which overlap the truncated area starting at pgoff,
                 * and no vma on a 32-bit arch can span beyond the 4GB.
                 */
                if (vma->vm_pgoff < pgoff)
                        v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
                else
                        v_offset = 0;

                unmap_hugepage_range(vma, vma->vm_start + v_offset,
                                     vma->vm_end, NULL);
        }
}

static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
        pgoff_t pgoff;
        struct address_space *mapping = inode->i_mapping;
        struct hstate *h = hstate_inode(inode);

        BUG_ON(offset & ~huge_page_mask(h));
        pgoff = offset >> PAGE_SHIFT;

        i_size_write(inode, offset);
        mutex_lock(&mapping->i_mmap_mutex);
        if (!RB_EMPTY_ROOT(&mapping->i_mmap))
                hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
        mutex_unlock(&mapping->i_mmap_mutex);
        truncate_hugepages(inode, offset);
        return 0;
}

static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        struct hstate *h = hstate_inode(inode);
        int error;
        unsigned int ia_valid = attr->ia_valid;

        BUG_ON(!inode);

        error = inode_change_ok(inode, attr);
        if (error)
                return error;

        if (ia_valid & ATTR_SIZE) {
                if (attr->ia_size & ~huge_page_mask(h))
                        return -EINVAL;
                error = hugetlb_vmtruncate(inode, attr->ia_size);
                if (error)
                        return error;
        }

        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
        return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
                                        struct hugetlbfs_config *config)
{
        struct inode *inode;

        inode = new_inode(sb);
        if (inode) {
                struct hugetlbfs_inode_info *info;
                inode->i_ino = get_next_ino();
                inode->i_mode = S_IFDIR | config->mode;
                inode->i_uid = config->uid;
                inode->i_gid = config->gid;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                info = HUGETLBFS_I(inode);
                mpol_shared_policy_init(&info->policy, NULL);
                inode->i_op = &hugetlbfs_dir_inode_operations;
                inode->i_fop = &simple_dir_operations;
                /* directory inodes start off with i_nlink == 2 (for "." entry) */
                inc_nlink(inode);
                lockdep_annotate_inode_mutex_key(inode);
        }
        return inode;
}

/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under
 * i_mmap_mutex.
 */
static struct lock_class_key hugetlbfs_i_mmap_mutex_key;

static struct inode *hugetlbfs_get_inode(struct super_block *sb,
                                        struct inode *dir,
                                        umode_t mode, dev_t dev)
{
        struct inode *inode;
        struct resv_map *resv_map;

        resv_map = resv_map_alloc();
        if (!resv_map)
                return NULL;

        inode = new_inode(sb);
        if (inode) {
                struct hugetlbfs_inode_info *info;
                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
                                &hugetlbfs_i_mmap_mutex_key);
                inode->i_mapping->a_ops = &hugetlbfs_aops;
                inode->i_mapping->backing_dev_info = &hugetlbfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_mapping->private_data = resv_map;
                info = HUGETLBFS_I(inode);
                /*
                 * The policy is initialized here even if we are creating a
                 * private inode because initialization simply creates an
                 * empty rb tree and calls spin_lock_init(); later, when we
                 * call mpol_free_shared_policy(), it will just return because
                 * the rb tree will still be empty.
                 */
                mpol_shared_policy_init(&info->policy, NULL);
                switch (mode & S_IFMT) {
                default:
                        init_special_inode(inode, mode, dev);
                        break;
                case S_IFREG:
                        inode->i_op = &hugetlbfs_inode_operations;
                        inode->i_fop = &hugetlbfs_file_operations;
                        break;
                case S_IFDIR:
                        inode->i_op = &hugetlbfs_dir_inode_operations;
                        inode->i_fop = &simple_dir_operations;

                        /* directory inodes start off with i_nlink == 2 (for "." entry) */
                        inc_nlink(inode);
                        break;
                case S_IFLNK:
                        inode->i_op = &page_symlink_inode_operations;
                        break;
                }
                lockdep_annotate_inode_mutex_key(inode);
        } else
                kref_put(&resv_map->refs, resv_map_release);

        return inode;
}

/*
 * File creation. Allocate an inode, and we're done..
 */
static int hugetlbfs_mknod(struct inode *dir,
                        struct dentry *dentry, umode_t mode, dev_t dev)
{
        struct inode *inode;
        int error = -ENOSPC;

        inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
        if (inode) {
                dir->i_ctime = dir->i_mtime = CURRENT_TIME;
                d_instantiate(dentry, inode);
                dget(dentry);   /* Extra count - pin the dentry in core */
                error = 0;
        }
        return error;
}

static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
        if (!retval)
                inc_nlink(dir);
        return retval;
}

static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
        return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_symlink(struct inode *dir,
                        struct dentry *dentry, const char *symname)
{
        struct inode *inode;
        int error = -ENOSPC;

        inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
        if (inode) {
                int l = strlen(symname)+1;
                error = page_symlink(inode, symname, l);
                if (!error) {
                        d_instantiate(dentry, inode);
                        dget(dentry);
                } else
                        iput(inode);
        }
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;

        return error;
}

/*
 * mark the head page dirty
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
        struct page *head = compound_head(page);

        SetPageDirty(head);
        return 0;
}

static int hugetlbfs_migrate_page(struct address_space *mapping,
                                struct page *newpage, struct page *page,
                                enum migrate_mode mode)
{
        int rc;

        rc = migrate_huge_page_move_mapping(mapping, newpage, page);
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
        migrate_page_copy(newpage, page);

        return MIGRATEPAGE_SUCCESS;
}

static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
        struct hstate *h = hstate_inode(dentry->d_inode);

        buf->f_type = HUGETLBFS_MAGIC;
        buf->f_bsize = huge_page_size(h);
        if (sbinfo) {
                spin_lock(&sbinfo->stat_lock);
                /* If no limits set, just report 0 for max/free/used
                 * blocks, like simple_statfs() */
                if (sbinfo->spool) {
                        long free_pages;

                        spin_lock(&sbinfo->spool->lock);
                        buf->f_blocks = sbinfo->spool->max_hpages;
                        free_pages = sbinfo->spool->max_hpages
                                - sbinfo->spool->used_hpages;
                        buf->f_bavail = buf->f_bfree = free_pages;
                        spin_unlock(&sbinfo->spool->lock);
                        buf->f_files = sbinfo->max_inodes;
                        buf->f_ffree = sbinfo->free_inodes;
                }
                spin_unlock(&sbinfo->stat_lock);
        }
        buf->f_namelen = NAME_MAX;
        return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

        if (sbi) {
                sb->s_fs_info = NULL;

                if (sbi->spool)
                        hugepage_put_subpool(sbi->spool);

                kfree(sbi);
        }
}

static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
        if (sbinfo->free_inodes >= 0) {
                spin_lock(&sbinfo->stat_lock);
                if (unlikely(!sbinfo->free_inodes)) {
                        spin_unlock(&sbinfo->stat_lock);
                        return 0;
                }
                sbinfo->free_inodes--;
                spin_unlock(&sbinfo->stat_lock);
        }

        return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
        if (sbinfo->free_inodes >= 0) {
                spin_lock(&sbinfo->stat_lock);
                sbinfo->free_inodes++;
                spin_unlock(&sbinfo->stat_lock);
        }
}

static struct kmem_cache *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
        struct hugetlbfs_inode_info *p;

        if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
                return NULL;
        p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
        if (unlikely(!p)) {
                hugetlbfs_inc_free_inodes(sbinfo);
                return NULL;
        }
        return &p->vfs_inode;
}

static void hugetlbfs_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);
        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
        hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
        mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
        call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
}

static const struct address_space_operations hugetlbfs_aops = {
        .write_begin    = hugetlbfs_write_begin,
        .write_end      = hugetlbfs_write_end,
        .set_page_dirty = hugetlbfs_set_page_dirty,
        .migratepage    = hugetlbfs_migrate_page,
};

static void init_once(void *foo)
{
        struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

        inode_init_once(&ei->vfs_inode);
}

const struct file_operations hugetlbfs_file_operations = {
        .read                   = hugetlbfs_read,
        .mmap                   = hugetlbfs_file_mmap,
        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
        .llseek                 = default_llseek,
};

static const struct inode_operations hugetlbfs_dir_inode_operations = {
        .create         = hugetlbfs_create,
        .lookup         = simple_lookup,
        .link           = simple_link,
        .unlink         = simple_unlink,
        .symlink        = hugetlbfs_symlink,
        .mkdir          = hugetlbfs_mkdir,
        .rmdir          = simple_rmdir,
        .mknod          = hugetlbfs_mknod,
        .rename         = simple_rename,
        .setattr        = hugetlbfs_setattr,
};

static const struct inode_operations hugetlbfs_inode_operations = {
        .setattr        = hugetlbfs_setattr,
};

static const struct super_operations hugetlbfs_ops = {
        .alloc_inode    = hugetlbfs_alloc_inode,
        .destroy_inode  = hugetlbfs_destroy_inode,
        .evict_inode    = hugetlbfs_evict_inode,
        .statfs         = hugetlbfs_statfs,
        .put_super      = hugetlbfs_put_super,
        .show_options   = generic_show_options,
};

static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
        char *p, *rest;
        substring_t args[MAX_OPT_ARGS];
        int option;
        unsigned long long size = 0;
        enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;

        if (!options)
                return 0;

        while ((p = strsep(&options, ",")) != NULL) {
                int token;
                if (!*p)
                        continue;

                token = match_token(p, tokens, args);
                switch (token) {
                case Opt_uid:
                        if (match_int(&args[0], &option))
                                goto bad_val;
                        pconfig->uid = make_kuid(current_user_ns(), option);
                        if (!uid_valid(pconfig->uid))
                                goto bad_val;
                        break;

                case Opt_gid:
                        if (match_int(&args[0], &option))
                                goto bad_val;
                        pconfig->gid = make_kgid(current_user_ns(), option);
                        if (!gid_valid(pconfig->gid))
                                goto bad_val;
                        break;

                case Opt_mode:
                        if (match_octal(&args[0], &option))
                                goto bad_val;
                        pconfig->mode = option & 01777U;
                        break;

                case Opt_size: {
                        /* memparse() will accept a K/M/G without a digit */
                        if (!isdigit(*args[0].from))
                                goto bad_val;
                        size = memparse(args[0].from, &rest);
                        setsize = SIZE_STD;
                        if (*rest == '%')
                                setsize = SIZE_PERCENT;
                        break;
                }

                case Opt_nr_inodes:
                        /* memparse() will accept a K/M/G without a digit */
                        if (!isdigit(*args[0].from))
                                goto bad_val;
                        pconfig->nr_inodes = memparse(args[0].from, &rest);
                        break;

                case Opt_pagesize: {
                        unsigned long ps;
                        ps = memparse(args[0].from, &rest);
                        pconfig->hstate = size_to_hstate(ps);
                        if (!pconfig->hstate) {
                                pr_err("Unsupported page size %lu MB\n",
                                        ps >> 20);
                                return -EINVAL;
                        }
                        break;
                }

                default:
                        pr_err("Bad mount option: \"%s\"\n", p);
                        return -EINVAL;
                }
        }

        /* Do size after hstate is set up */
        if (setsize > NO_SIZE) {
                struct hstate *h = pconfig->hstate;
                if (setsize == SIZE_PERCENT) {
                        size <<= huge_page_shift(h);
                        size *= h->max_huge_pages;
                        do_div(size, 100);
                }
                pconfig->nr_blocks = (size >> huge_page_shift(h));
        }

        return 0;

bad_val:
        pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
        return -EINVAL;
}
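
/*
 * Worked example (illustrative): with a 2 MiB hstate, max_huge_pages
 * == 512 and "size=50%", memparse() returns 50 with *rest == '%', so
 * setsize == SIZE_PERCENT and the block above computes
 *
 *	size = 50 << 21;	// 104857600
 *	size *= 512;		// 53687091200
 *	do_div(size, 100);	// 536870912, i.e. 256 * 2 MiB
 *
 * giving nr_blocks == 256, half of max_huge_pages as requested.
 */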

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
        int ret;
        struct hugetlbfs_config config;
        struct hugetlbfs_sb_info *sbinfo;

        save_mount_options(sb, data);

        config.nr_blocks = -1; /* No limit on size by default */
        config.nr_inodes = -1; /* No limit on number of inodes by default */
        config.uid = current_fsuid();
        config.gid = current_fsgid();
        config.mode = 0755;
        config.hstate = &default_hstate;
        ret = hugetlbfs_parse_options(data, &config);
        if (ret)
                return ret;

        sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
        if (!sbinfo)
                return -ENOMEM;
        sb->s_fs_info = sbinfo;
        sbinfo->hstate = config.hstate;
        spin_lock_init(&sbinfo->stat_lock);
        sbinfo->max_inodes = config.nr_inodes;
        sbinfo->free_inodes = config.nr_inodes;
        sbinfo->spool = NULL;
        if (config.nr_blocks != -1) {
                sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
                if (!sbinfo->spool)
                        goto out_free;
        }
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = huge_page_size(config.hstate);
        sb->s_blocksize_bits = huge_page_shift(config.hstate);
        sb->s_magic = HUGETLBFS_MAGIC;
        sb->s_op = &hugetlbfs_ops;
        sb->s_time_gran = 1;
        sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
        if (!sb->s_root)
                goto out_free;
        return 0;
out_free:
        kfree(sbinfo->spool);
        kfree(sbinfo);
        return -ENOMEM;
}

static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
{
        return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
}

static struct file_system_type hugetlbfs_fs_type = {
        .name           = "hugetlbfs",
        .mount          = hugetlbfs_mount,
        .kill_sb        = kill_litter_super,
};
MODULE_ALIAS_FS("hugetlbfs");

static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

static int can_do_hugetlb_shm(void)
{
        kgid_t shm_group;
        shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
        return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}

static int get_hstate_idx(int page_size_log)
{
        struct hstate *h = hstate_sizelog(page_size_log);

        if (!h)
                return -1;
        return h - hstates;
}

static const struct dentry_operations anon_ops = {
        .d_dname = simple_dname
};

/*
 * Note that size should be aligned to the proper hugepage size by the
 * caller; otherwise hugetlb_reserve_pages() reserves one fewer hugepage
 * than intended.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
                                vm_flags_t acctflag, struct user_struct **user,
                                int creat_flags, int page_size_log)
{
        struct file *file = ERR_PTR(-ENOMEM);
        struct inode *inode;
        struct path path;
        struct super_block *sb;
        struct qstr quick_string;
        int hstate_idx;

        hstate_idx = get_hstate_idx(page_size_log);
        if (hstate_idx < 0)
                return ERR_PTR(-ENODEV);

        *user = NULL;
        if (!hugetlbfs_vfsmount[hstate_idx])
                return ERR_PTR(-ENOENT);

        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
                *user = current_user();
                if (user_shm_lock(size, *user)) {
                        task_lock(current);
                        pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
                                current->comm, current->pid);
                        task_unlock(current);
                } else {
                        *user = NULL;
                        return ERR_PTR(-EPERM);
                }
        }

        sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
        quick_string.name = name;
        quick_string.len = strlen(quick_string.name);
        quick_string.hash = 0;
        path.dentry = d_alloc_pseudo(sb, &quick_string);
        if (!path.dentry)
                goto out_shm_unlock;

        d_set_d_op(path.dentry, &anon_ops);
        path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
        file = ERR_PTR(-ENOSPC);
        inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
        if (!inode)
                goto out_dentry;

        file = ERR_PTR(-ENOMEM);
        if (hugetlb_reserve_pages(inode, 0,
                        size >> huge_page_shift(hstate_inode(inode)), NULL,
                        acctflag))
                goto out_inode;

        d_instantiate(path.dentry, inode);
        inode->i_size = size;
        clear_nlink(inode);

        file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
                        &hugetlbfs_file_operations);
        if (IS_ERR(file))
                goto out_dentry; /* inode is already attached */

        return file;

out_inode:
        iput(inode);
out_dentry:
        path_put(&path);
out_shm_unlock:
        if (*user) {
                user_shm_unlock(size, *user);
                *user = NULL;
        }
        return file;
}
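
/*
 * Illustrative caller sketch (not from this file): the SysV IPC path
 * arrives here via shmget(2) when SHM_HUGETLB is set, roughly
 *
 *	int id = shmget(IPC_PRIVATE, 4 * 1024 * 1024,
 *			IPC_CREAT | SHM_HUGETLB | 0600);
 *	void *p = shmat(id, NULL, 0);
 *
 * gated by the can_do_hugetlb_shm() / user_shm_lock() checks above.
 */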

static int __init init_hugetlbfs_fs(void)
{
        struct hstate *h;
        int error;
        int i;

        if (!hugepages_supported()) {
                pr_info("disabling because there are no supported hugepage sizes\n");
                return -ENOTSUPP;
        }

        error = bdi_init(&hugetlbfs_backing_dev_info);
        if (error)
                return error;

        error = -ENOMEM;
        hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
                                        sizeof(struct hugetlbfs_inode_info),
                                        0, 0, init_once);
        if (hugetlbfs_inode_cachep == NULL)
                goto out2;

        error = register_filesystem(&hugetlbfs_fs_type);
        if (error)
                goto out;

        i = 0;
        for_each_hstate(h) {
                char buf[50];
                unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

                snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
                hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
                                                        buf);

                if (IS_ERR(hugetlbfs_vfsmount[i])) {
                        pr_err("Cannot mount internal hugetlbfs for page size %uK\n",
                                ps_kb);
                        error = PTR_ERR(hugetlbfs_vfsmount[i]);
                        hugetlbfs_vfsmount[i] = NULL;
                }
                i++;
        }
        /* Non default hstates are optional */
        if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
                return 0;

 out:
        kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
        bdi_destroy(&hugetlbfs_backing_dev_info);
        return error;
}

static void __exit exit_hugetlbfs_fs(void)
{
        struct hstate *h;
        int i;

        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(hugetlbfs_inode_cachep);
        i = 0;
        for_each_hstate(h)
                kern_unmount(hugetlbfs_vfsmount[i++]);
        unregister_filesystem(&hugetlbfs_fs_type);
        bdi_destroy(&hugetlbfs_backing_dev_info);
}

module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");