]> git.kernelconcepts.de Git - karo-tx-linux.git/blob - mm/hugetlb.c
mm, hugetlb: improve page-fault scalability
[karo-tx-linux.git] / mm / hugetlb.c
1 /*
2  * Generic hugetlb support.
3  * (C) Nadia Yvette Chambers, April 2004
4  */
5 #include <linux/list.h>
6 #include <linux/init.h>
7 #include <linux/module.h>
8 #include <linux/mm.h>
9 #include <linux/seq_file.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/mmu_notifier.h>
13 #include <linux/nodemask.h>
14 #include <linux/pagemap.h>
15 #include <linux/mempolicy.h>
16 #include <linux/cpuset.h>
17 #include <linux/mutex.h>
18 #include <linux/bootmem.h>
19 #include <linux/sysfs.h>
20 #include <linux/slab.h>
21 #include <linux/rmap.h>
22 #include <linux/swap.h>
23 #include <linux/swapops.h>
24 #include <linux/page-isolation.h>
25 #include <linux/jhash.h>
26
27 #include <asm/page.h>
28 #include <asm/pgtable.h>
29 #include <asm/tlb.h>
30
31 #include <linux/io.h>
32 #include <linux/hugetlb.h>
33 #include <linux/hugetlb_cgroup.h>
34 #include <linux/node.h>
35 #include "internal.h"
36
/*
 * Constant lower/upper bounds (0 and ~0UL).
 * NOTE(review): presumably referenced as min/max limits by the hugetlb sysctl
 * table defined outside this part of the file -- confirm.
 */
37 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
/* When non-zero, htlb_alloc_mask() selects GFP_HIGHUSER_MOVABLE. */
38 unsigned long hugepages_treat_as_movable;
39
/*
 * Table of huge page size descriptors.
 * NOTE(review): presumably hugetlb_max_hstate counts the entries in use and
 * default_hstate_idx selects the default one -- confirm against the setup code.
 */
40 int hugetlb_max_hstate __read_mostly;
41 unsigned int default_hstate_idx;
42 struct hstate hstates[HUGE_MAX_HSTATE];
43
/* Init-time-only list head (placed in __initdata). */
44 __initdata LIST_HEAD(huge_boot_pages);
45
46 /* for command line parsing */
47 static struct hstate * __initdata parsed_hstate;
48 static unsigned long __initdata default_hstate_max_huge_pages;
49 static unsigned long __initdata default_hstate_size;
50
51 /*
52  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
53  * free_huge_pages, and surplus_huge_pages.
54  */
55 DEFINE_SPINLOCK(hugetlb_lock);
56
57 /*
58  * Serializes faults on the same logical page.  This is used to
59  * prevent spurious OOMs when the hugepage pool is fully utilized.
60  */
/* Number of mutexes in htlb_fault_mutex_table (allocated elsewhere). */
61 static int num_fault_mutexes;
62 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
63
64 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
65 {
66         bool free = (spool->count == 0) && (spool->used_hpages == 0);
67
68         spin_unlock(&spool->lock);
69
70         /* If no pages are used, and no other handles to the subpool
71          * remain, free the subpool the subpool remain */
72         if (free)
73                 kfree(spool);
74 }
75
76 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
77 {
78         struct hugepage_subpool *spool;
79
80         spool = kmalloc(sizeof(*spool), GFP_KERNEL);
81         if (!spool)
82                 return NULL;
83
84         spin_lock_init(&spool->lock);
85         spool->count = 1;
86         spool->max_hpages = nr_blocks;
87         spool->used_hpages = 0;
88
89         return spool;
90 }
91
92 void hugepage_put_subpool(struct hugepage_subpool *spool)
93 {
94         spin_lock(&spool->lock);
95         BUG_ON(!spool->count);
96         spool->count--;
97         unlock_or_release_subpool(spool);
98 }
99
100 static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
101                                       long delta)
102 {
103         int ret = 0;
104
105         if (!spool)
106                 return 0;
107
108         spin_lock(&spool->lock);
109         if ((spool->used_hpages + delta) <= spool->max_hpages) {
110                 spool->used_hpages += delta;
111         } else {
112                 ret = -ENOMEM;
113         }
114         spin_unlock(&spool->lock);
115
116         return ret;
117 }
118
119 static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
120                                        long delta)
121 {
122         if (!spool)
123                 return;
124
125         spin_lock(&spool->lock);
126         spool->used_hpages -= delta;
127         /* If hugetlbfs_put_super couldn't free spool due to
128         * an outstanding quota reference, free it now. */
129         unlock_or_release_subpool(spool);
130 }
131
132 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
133 {
134         return HUGETLBFS_SB(inode->i_sb)->spool;
135 }
136
137 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
138 {
139         return subpool_inode(file_inode(vma->vm_file));
140 }
141
142 /*
143  * Region tracking -- allows tracking of reservations and instantiated pages
144  *                    across the pages in a mapping.
145  *
146  * The region data structures are embedded into a resv_map and
147  * protected by a resv_map's lock
148  */
/*
 * One tracked range.  The region_*() helpers compute sizes as (to - from) and
 * walk the list assuming the regions appear in ascending order.
 */
149 struct file_region {
150         struct list_head link;  /* entry on resv_map->regions */
151         long from;              /* first offset covered by the region */
152         long to;                /* end offset; the region spans to - from units */
153 };
154
/*
 * Record the range [f, t) in @resv, merging it with every region it touches.
 *
 * No memory is allocated here: the first region at or after @f is reused and
 * widened, and any further regions swallowed by the widened range are freed.
 * Always returns 0.
 *
 * NOTE(review): the first loop does not check whether it ran off the end of
 * the list before rg is dereferenced; presumably callers always run
 * region_chg() first so that a (possibly zero-size) region exists -- confirm.
 */
155 static long region_add(struct resv_map *resv, long f, long t)
156 {
157         struct list_head *head = &resv->regions;
158         struct file_region *rg, *nrg, *trg;
159
160         spin_lock(&resv->lock);
161         /* Locate the region we are either in or before. */
162         list_for_each_entry(rg, head, link)
163                 if (f <= rg->to)
164                         break;
165
166         /* Round our left edge to the current segment if it encloses us. */
167         if (f > rg->from)
168                 f = rg->from;
169
170         /* Check for and consume any regions we now overlap with. */
        /* nrg is the region that survives and receives the merged range. */
171         nrg = rg;
        /* Start the walk at rg itself by passing its predecessor as the head. */
172         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
173                 if (&rg->link == head)
174                         break;
175                 if (rg->from > t)
176                         break;
177
178                 /* If this area reaches higher then extend our area to
179                  * include it completely.  If this is not the first area
180                  * which we intend to reuse, free it. */
181                 if (rg->to > t)
182                         t = rg->to;
183                 if (rg != nrg) {
184                         list_del(&rg->link);
185                         kfree(rg);
186                 }
187         }
188         nrg->from = f;
189         nrg->to = t;
190         spin_unlock(&resv->lock);
191         return 0;
192 }
193
/*
 * Compute how much of the range [f, t) is not yet covered by regions in
 * @resv, without recording the range itself.
 *
 * If no existing region overlaps the range, a zero-size placeholder region
 * [f, f) is inserted at the right position.  The placeholder is allocated
 * with GFP_KERNEL, so the lock is dropped around the allocation and the
 * lookup is redone from scratch afterwards.
 *
 * Returns the number of uncovered units, or -ENOMEM if the placeholder could
 * not be allocated.
 */
194 static long region_chg(struct resv_map *resv, long f, long t)
195 {
196         struct list_head *head = &resv->regions;
197         struct file_region *rg, *nrg = NULL;
198         long chg = 0;
199
200 retry:
201         spin_lock(&resv->lock);
202         /* Locate the region we are before or in. */
203         list_for_each_entry(rg, head, link)
204                 if (f <= rg->to)
205                         break;
206
207         /* If we are below the current region then a new region is required.
208          * Subtle, allocate a new region at the position but make it zero
209          * size such that we can guarantee to record the reservation. */
210         if (&rg->link == head || t < rg->from) {
211                 if (!nrg) {
                        /* Allocation may sleep: drop the spinlock first. */
212                         spin_unlock(&resv->lock);
213                         nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
214                         if (!nrg)
215                                 return -ENOMEM;
216
217                         nrg->from = f;
218                         nrg->to   = f;
219                         INIT_LIST_HEAD(&nrg->link);
                        /* The list may have changed while unlocked. */
220                         goto retry;
221                 }
222
                /* Insert the placeholder just before rg. */
223                 list_add(&nrg->link, rg->link.prev);
224                 chg = t - f;
225                 goto out_nrg;
226         }
227
228         /* Round our left edge to the current segment if it encloses us. */
229         if (f > rg->from)
230                 f = rg->from;
231         chg = t - f;
232
233         /* Check for and consume any regions we now overlap with. */
234         list_for_each_entry(rg, rg->link.prev, link) {
235                 if (&rg->link == head)
236                         break;
237                 if (rg->from > t)
238                         goto out;
239
240                 /* We overlap with this area, if it extends further than
241                  * us then we must extend ourselves.  Account for its
242                  * existing reservation. */
243                 if (rg->to > t) {
244                         chg += rg->to - t;
245                         t = rg->to;
246                 }
247                 chg -= rg->to - rg->from;
248         }
249
250 out:
251         spin_unlock(&resv->lock);
252         /*  We already know we raced and no longer need the new region */
        /* nrg is NULL unless a placeholder was allocated before a retry. */
253         kfree(nrg);
254         return chg;
255 out_nrg:
        /* The placeholder is now on the list and must not be freed. */
256         spin_unlock(&resv->lock);
257         return chg;
258 }
259
/*
 * Drop everything at or beyond @end from @resv.
 *
 * A region straddling @end is shortened to stop at @end; every region that
 * follows is unlinked and freed.  Returns the total size removed.
 */
260 static long region_truncate(struct resv_map *resv, long end)
261 {
262         struct list_head *head = &resv->regions;
263         struct file_region *rg, *trg;
264         long chg = 0;
265
266         spin_lock(&resv->lock);
267         /* Locate the region we are either in or before. */
268         list_for_each_entry(rg, head, link)
269                 if (end <= rg->to)
270                         break;
        /* Nothing reaches @end: there is nothing to remove. */
271         if (&rg->link == head)
272                 goto out;
273
274         /* If we are in the middle of a region then adjust it. */
275         if (end > rg->from) {
276                 chg = rg->to - end;
277                 rg->to = end;
                /* The shortened region stays; continue with its successor. */
278                 rg = list_entry(rg->link.next, typeof(*rg), link);
279         }
280
281         /* Drop any remaining regions. */
282         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
283                 if (&rg->link == head)
284                         break;
285                 chg += rg->to - rg->from;
286                 list_del(&rg->link);
287                 kfree(rg);
288         }
289
290 out:
291         spin_unlock(&resv->lock);
292         return chg;
293 }
294
295 static long region_count(struct resv_map *resv, long f, long t)
296 {
297         struct list_head *head = &resv->regions;
298         struct file_region *rg;
299         long chg = 0;
300
301         spin_lock(&resv->lock);
302         /* Locate each segment we overlap with, and count that overlap. */
303         list_for_each_entry(rg, head, link) {
304                 long seg_from;
305                 long seg_to;
306
307                 if (rg->to <= f)
308                         continue;
309                 if (rg->from >= t)
310                         break;
311
312                 seg_from = max(rg->from, f);
313                 seg_to = min(rg->to, t);
314
315                 chg += seg_to - seg_from;
316         }
317         spin_unlock(&resv->lock);
318
319         return chg;
320 }
321
322 /*
323  * Convert the address within this vma to the page offset within
324  * the mapping, in pagecache page units; huge pages here.
325  */
326 static pgoff_t vma_hugecache_offset(struct hstate *h,
327                         struct vm_area_struct *vma, unsigned long address)
328 {
329         return ((address - vma->vm_start) >> huge_page_shift(h)) +
330                         (vma->vm_pgoff >> huge_page_order(h));
331 }
332
333 pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
334                                      unsigned long address)
335 {
336         return vma_hugecache_offset(hstate_vma(vma), vma, address);
337 }
338
339 /*
340  * Return the size of the pages allocated when backing a VMA. In the majority
341  * cases this will be same size as used by the page table entries.
342  */
343 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
344 {
345         struct hstate *hstate;
346
347         if (!is_vm_hugetlb_page(vma))
348                 return PAGE_SIZE;
349
350         hstate = hstate_vma(vma);
351
352         return 1UL << huge_page_shift(hstate);
353 }
354 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
355
/*
 * Return the page size the MMU uses to back a VMA.  This generic version
 * assumes the MMU page size equals the kernel page size; an architecture
 * where the two differ supplies its own vma_mmu_pagesize, which disables
 * this definition.
 */
#ifndef vma_mmu_pagesize
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        unsigned long size = vma_kernel_pagesize(vma);

        return size;
}
#endif
368
369 /*
370  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
371  * bits of the reservation map pointer, which are always clear due to
372  * alignment.
373  */
/* Tested by vma_has_reserves(): the mapping belongs to the mmap() caller. */
374 #define HPAGE_RESV_OWNER    (1UL << 0)
/* NOTE(review): not used in the visible part of the file; meaning to be confirmed. */
375 #define HPAGE_RESV_UNMAPPED (1UL << 1)
/* All flag bits; vma_resv_map() masks these off to recover the pointer. */
376 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
377
378 /*
379  * These helpers are used to track how many pages are reserved for
380  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
381  * is guaranteed to have their future faults succeed.
382  *
383  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
384  * the reserve counters are updated with the hugetlb_lock held. It is safe
385  * to reset the VMA at fork() time as it is not in use yet and there is no
386  * chance of the global counters getting corrupted as a result of the values.
387  *
388  * The private mapping reservation is represented in a subtly different
389  * manner to a shared mapping.  A shared mapping has a region map associated
390  * with the underlying file, this region map represents the backing file
391  * pages which have ever had a reservation assigned which this persists even
392  * after the page is instantiated.  A private mapping has a region map
393  * associated with the original mmap which is attached to all VMAs which
394  * reference it, this region map represents those offsets which have consumed
395  * reservation ie. where pages have been instantiated.
396  */
397 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
398 {
399         return (unsigned long)vma->vm_private_data;
400 }
401
402 static void set_vma_private_data(struct vm_area_struct *vma,
403                                                         unsigned long value)
404 {
405         vma->vm_private_data = (void *)value;
406 }
407
408 struct resv_map *resv_map_alloc(void)
409 {
410         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
411         if (!resv_map)
412                 return NULL;
413
414         kref_init(&resv_map->refs);
415         spin_lock_init(&resv_map->lock);
416         INIT_LIST_HEAD(&resv_map->regions);
417
418         return resv_map;
419 }
420
421 void resv_map_release(struct kref *ref)
422 {
423         struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
424
425         /* Clear out any active regions before we release the map. */
426         region_truncate(resv_map, 0);
427         kfree(resv_map);
428 }
429
430 static inline struct resv_map *inode_resv_map(struct inode *inode)
431 {
432         return inode->i_mapping->private_data;
433 }
434
435 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
436 {
437         VM_BUG_ON(!is_vm_hugetlb_page(vma));
438         if (vma->vm_flags & VM_MAYSHARE) {
439                 struct address_space *mapping = vma->vm_file->f_mapping;
440                 struct inode *inode = mapping->host;
441
442                 return inode_resv_map(inode);
443
444         } else {
445                 return (struct resv_map *)(get_vma_private_data(vma) &
446                                                         ~HPAGE_RESV_MASK);
447         }
448 }
449
450 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
451 {
452         VM_BUG_ON(!is_vm_hugetlb_page(vma));
453         VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
454
455         set_vma_private_data(vma, (get_vma_private_data(vma) &
456                                 HPAGE_RESV_MASK) | (unsigned long)map);
457 }
458
459 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
460 {
461         VM_BUG_ON(!is_vm_hugetlb_page(vma));
462         VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
463
464         set_vma_private_data(vma, get_vma_private_data(vma) | flags);
465 }
466
/* Return 1 if any bit of @flag is set in @vma's private data, else 0. */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
        VM_BUG_ON(!is_vm_hugetlb_page(vma));

        return !!(get_vma_private_data(vma) & flag);
}
473
474 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
475 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
476 {
477         VM_BUG_ON(!is_vm_hugetlb_page(vma));
478         if (!(vma->vm_flags & VM_MAYSHARE))
479                 vma->vm_private_data = (void *)0;
480 }
481
482 /* Returns true if the VMA has associated reserve pages */
483 static int vma_has_reserves(struct vm_area_struct *vma, long chg)
484 {
485         if (vma->vm_flags & VM_NORESERVE) {
486                 /*
487                  * This address is already reserved by other process(chg == 0),
488                  * so, we should decrement reserved count. Without decrementing,
489                  * reserve count remains after releasing inode, because this
490                  * allocated page will go into page cache and is regarded as
491                  * coming from reserved pool in releasing step.  Currently, we
492                  * don't have any other solution to deal with this situation
493                  * properly, so add work-around here.
494                  */
495                 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
496                         return 1;
497                 else
498                         return 0;
499         }
500
501         /* Shared mappings always use reserves */
502         if (vma->vm_flags & VM_MAYSHARE)
503                 return 1;
504
505         /*
506          * Only the process that called mmap() has reserves for
507          * private mappings.
508          */
509         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
510                 return 1;
511
512         return 0;
513 }
514
515 static void enqueue_huge_page(struct hstate *h, struct page *page)
516 {
517         int nid = page_to_nid(page);
518         list_move(&page->lru, &h->hugepage_freelists[nid]);
519         h->free_huge_pages++;
520         h->free_huge_pages_node[nid]++;
521 }
522
523 static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
524 {
525         struct page *page;
526
527         list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
528                 if (!is_migrate_isolate_page(page))
529                         break;
530         /*
531          * if 'non-isolated free hugepage' not found on the list,
532          * the allocation fails.
533          */
534         if (&h->hugepage_freelists[nid] == &page->lru)
535                 return NULL;
536         list_move(&page->lru, &h->hugepage_activelist);
537         set_page_refcounted(page);
538         h->free_huge_pages--;
539         h->free_huge_pages_node[nid]--;
540         return page;
541 }
542
543 /* Movability of hugepages depends on migration support. */
544 static inline gfp_t htlb_alloc_mask(struct hstate *h)
545 {
546         if (hugepages_treat_as_movable || hugepage_migration_support(h))
547                 return GFP_HIGHUSER_MOVABLE;
548         else
549                 return GFP_HIGHUSER;
550 }
551
552 static struct page *dequeue_huge_page_vma(struct hstate *h,
553                                 struct vm_area_struct *vma,
554                                 unsigned long address, int avoid_reserve,
555                                 long chg)
556 {
557         struct page *page = NULL;
558         struct mempolicy *mpol;
559         nodemask_t *nodemask;
560         struct zonelist *zonelist;
561         struct zone *zone;
562         struct zoneref *z;
563         unsigned int cpuset_mems_cookie;
564
565         /*
566          * A child process with MAP_PRIVATE mappings created by their parent
567          * have no page reserves. This check ensures that reservations are
568          * not "stolen". The child may still get SIGKILLed
569          */
570         if (!vma_has_reserves(vma, chg) &&
571                         h->free_huge_pages - h->resv_huge_pages == 0)
572                 goto err;
573
574         /* If reserves cannot be used, ensure enough pages are in the pool */
575         if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
576                 goto err;
577
578 retry_cpuset:
579         cpuset_mems_cookie = read_mems_allowed_begin();
580         zonelist = huge_zonelist(vma, address,
581                                         htlb_alloc_mask(h), &mpol, &nodemask);
582
583         for_each_zone_zonelist_nodemask(zone, z, zonelist,
584                                                 MAX_NR_ZONES - 1, nodemask) {
585                 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
586                         page = dequeue_huge_page_node(h, zone_to_nid(zone));
587                         if (page) {
588                                 if (avoid_reserve)
589                                         break;
590                                 if (!vma_has_reserves(vma, chg))
591                                         break;
592
593                                 SetPagePrivate(page);
594                                 h->resv_huge_pages--;
595                                 break;
596                         }
597                 }
598         }
599
600         mpol_cond_put(mpol);
601         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
602                 goto retry_cpuset;
603         return page;
604
605 err:
606         return NULL;
607 }
608
609 static void update_and_free_page(struct hstate *h, struct page *page)
610 {
611         int i;
612
613         VM_BUG_ON(h->order >= MAX_ORDER);
614
615         h->nr_huge_pages--;
616         h->nr_huge_pages_node[page_to_nid(page)]--;
617         for (i = 0; i < pages_per_huge_page(h); i++) {
618                 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
619                                 1 << PG_referenced | 1 << PG_dirty |
620                                 1 << PG_active | 1 << PG_reserved |
621                                 1 << PG_private | 1 << PG_writeback);
622         }
623         VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
624         set_compound_page_dtor(page, NULL);
625         set_page_refcounted(page);
626         arch_release_hugepage(page);
627         __free_pages(page, huge_page_order(h));
628 }
629
630 struct hstate *size_to_hstate(unsigned long size)
631 {
632         struct hstate *h;
633
634         for_each_hstate(h) {
635                 if (huge_page_size(h) == size)
636                         return h;
637         }
638         return NULL;
639 }
640
641 static void free_huge_page(struct page *page)
642 {
643         /*
644          * Can't pass hstate in here because it is called from the
645          * compound page destructor.
646          */
647         struct hstate *h = page_hstate(page);
648         int nid = page_to_nid(page);
649         struct hugepage_subpool *spool =
650                 (struct hugepage_subpool *)page_private(page);
651         bool restore_reserve;
652
653         set_page_private(page, 0);
654         page->mapping = NULL;
655         BUG_ON(page_count(page));
656         BUG_ON(page_mapcount(page));
657         restore_reserve = PagePrivate(page);
658         ClearPagePrivate(page);
659
660         spin_lock(&hugetlb_lock);
661         hugetlb_cgroup_uncharge_page(hstate_index(h),
662                                      pages_per_huge_page(h), page);
663         if (restore_reserve)
664                 h->resv_huge_pages++;
665
666         if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
667                 /* remove the page from active list */
668                 list_del(&page->lru);
669                 update_and_free_page(h, page);
670                 h->surplus_huge_pages--;
671                 h->surplus_huge_pages_node[nid]--;
672         } else {
673                 arch_clear_hugepage_flags(page);
674                 enqueue_huge_page(h, page);
675         }
676         spin_unlock(&hugetlb_lock);
677         hugepage_subpool_put_pages(spool, 1);
678 }
679
680 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
681 {
682         INIT_LIST_HEAD(&page->lru);
683         set_compound_page_dtor(page, free_huge_page);
684         spin_lock(&hugetlb_lock);
685         set_hugetlb_cgroup(page, NULL);
686         h->nr_huge_pages++;
687         h->nr_huge_pages_node[nid]++;
688         spin_unlock(&hugetlb_lock);
689         put_page(page); /* free it into the hugepage allocator */
690 }
691
692 static void prep_compound_gigantic_page(struct page *page, unsigned long order)
693 {
694         int i;
695         int nr_pages = 1 << order;
696         struct page *p = page + 1;
697
698         /* we rely on prep_new_huge_page to set the destructor */
699         set_compound_order(page, order);
700         __SetPageHead(page);
701         __ClearPageReserved(page);
702         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
703                 __SetPageTail(p);
704                 /*
705                  * For gigantic hugepages allocated through bootmem at
706                  * boot, it's safer to be consistent with the not-gigantic
707                  * hugepages and clear the PG_reserved bit from all tail pages
708                  * too.  Otherwse drivers using get_user_pages() to access tail
709                  * pages may get the reference counting wrong if they see
710                  * PG_reserved set on a tail page (despite the head page not
711                  * having PG_reserved set).  Enforcing this consistency between
712                  * head and tail pages allows drivers to optimize away a check
713                  * on the head page when they need know if put_page() is needed
714                  * after get_user_pages().
715                  */
716                 __ClearPageReserved(p);
717                 set_page_count(p, 0);
718                 p->first_page = page;
719         }
720 }
721
722 /*
723  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
724  * transparent huge pages.  See the PageTransHuge() documentation for more
725  * details.
726  */
727 int PageHuge(struct page *page)
728 {
729         if (!PageCompound(page))
730                 return 0;
731
732         page = compound_head(page);
733         return get_compound_page_dtor(page) == free_huge_page;
734 }
735 EXPORT_SYMBOL_GPL(PageHuge);
736
737 /*
738  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
739  * normal or transparent huge pages.
740  */
741 int PageHeadHuge(struct page *page_head)
742 {
743         if (!PageHead(page_head))
744                 return 0;
745
746         return get_compound_page_dtor(page_head) == free_huge_page;
747 }
748
749 pgoff_t __basepage_index(struct page *page)
750 {
751         struct page *page_head = compound_head(page);
752         pgoff_t index = page_index(page_head);
753         unsigned long compound_idx;
754
755         if (!PageHuge(page_head))
756                 return page_index(page);
757
758         if (compound_order(page_head) >= MAX_ORDER)
759                 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
760         else
761                 compound_idx = page - page_head;
762
763         return (index << compound_order(page_head)) + compound_idx;
764 }
765
766 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
767 {
768         struct page *page;
769
770         if (h->order >= MAX_ORDER)
771                 return NULL;
772
773         page = alloc_pages_exact_node(nid,
774                 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
775                                                 __GFP_REPEAT|__GFP_NOWARN,
776                 huge_page_order(h));
777         if (page) {
778                 if (arch_prepare_hugepage(page)) {
779                         __free_pages(page, huge_page_order(h));
780                         return NULL;
781                 }
782                 prep_new_huge_page(h, page, nid);
783         }
784
785         return page;
786 }
787
788 /*
789  * common helper functions for hstate_next_node_to_{alloc|free}.
790  * We may have allocated or freed a huge page based on a different
791  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
792  * be outside of *nodes_allowed.  Ensure that we use an allowed
793  * node for alloc or free.
794  */
795 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
796 {
797         nid = next_node(nid, *nodes_allowed);
798         if (nid == MAX_NUMNODES)
799                 nid = first_node(*nodes_allowed);
800         VM_BUG_ON(nid >= MAX_NUMNODES);
801
802         return nid;
803 }
804
805 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
806 {
807         if (!node_isset(nid, *nodes_allowed))
808                 nid = next_node_allowed(nid, nodes_allowed);
809         return nid;
810 }
811
812 /*
813  * returns the previously saved node ["this node"] from which to
814  * allocate a persistent huge page for the pool and advance the
815  * next node from which to allocate, handling wrap at end of node
816  * mask.
817  */
818 static int hstate_next_node_to_alloc(struct hstate *h,
819                                         nodemask_t *nodes_allowed)
820 {
821         int nid;
822
823         VM_BUG_ON(!nodes_allowed);
824
825         nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
826         h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
827
828         return nid;
829 }
830
831 /*
832  * helper for free_pool_huge_page() - return the previously saved
833  * node ["this node"] from which to free a huge page.  Advance the
834  * next node id whether or not we find a free huge page to free so
835  * that the next attempt to free addresses the next node.
836  */
837 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
838 {
839         int nid;
840
841         VM_BUG_ON(!nodes_allowed);
842
843         nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
844         h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
845
846         return nid;
847 }
848
/*
 * Round-robin node iterators.
 *
 * Each macro runs its body once per node set in *mask.  Before every pass
 * "node" is assigned the value returned by hstate_next_node_to_alloc() or
 * hstate_next_node_to_free(), and "nr_nodes" counts the remaining passes.
 * The "|| 1" keeps the loop going when the returned node id is 0.
 *
 * All parameters are parenthesized so that expression arguments (for
 * example "base + 1" as the mask) expand with the intended precedence.
 */
#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)           \
        for ((nr_nodes) = nodes_weight(*(mask));                        \
                (nr_nodes) > 0 &&                                       \
                (((node) = hstate_next_node_to_alloc((hs), (mask))) || 1); \
                (nr_nodes)--)

#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)            \
        for ((nr_nodes) = nodes_weight(*(mask));                        \
                (nr_nodes) > 0 &&                                       \
                (((node) = hstate_next_node_to_free((hs), (mask))) || 1); \
                (nr_nodes)--)
860
861 static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
862 {
863         struct page *page;
864         int nr_nodes, node;
865         int ret = 0;
866
867         for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
868                 page = alloc_fresh_huge_page_node(h, node);
869                 if (page) {
870                         ret = 1;
871                         break;
872                 }
873         }
874
875         if (ret)
876                 count_vm_event(HTLB_BUDDY_PGALLOC);
877         else
878                 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
879
880         return ret;
881 }
882
883 /*
884  * Free huge page from pool from next node to free.
885  * Attempt to keep persistent huge pages more or less
886  * balanced over allowed nodes.
887  * Called with hugetlb_lock locked.
888  */
889 static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
890                                                          bool acct_surplus)
891 {
892         int nr_nodes, node;
893         int ret = 0;
894
895         for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
896                 /*
897                  * If we're returning unused surplus pages, only examine
898                  * nodes with surplus pages.
899                  */
900                 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
901                     !list_empty(&h->hugepage_freelists[node])) {
902                         struct page *page =
903                                 list_entry(h->hugepage_freelists[node].next,
904                                           struct page, lru);
905                         list_del(&page->lru);
906                         h->free_huge_pages--;
907                         h->free_huge_pages_node[node]--;
908                         if (acct_surplus) {
909                                 h->surplus_huge_pages--;
910                                 h->surplus_huge_pages_node[node]--;
911                         }
912                         update_and_free_page(h, page);
913                         ret = 1;
914                         break;
915                 }
916         }
917
918         return ret;
919 }
920
921 /*
922  * Dissolve a given free hugepage into free buddy pages. This function does
923  * nothing for in-use (including surplus) hugepages.
924  */
925 static void dissolve_free_huge_page(struct page *page)
926 {
927         spin_lock(&hugetlb_lock);
928         if (PageHuge(page) && !page_count(page)) {
929                 struct hstate *h = page_hstate(page);
930                 int nid = page_to_nid(page);
931                 list_del(&page->lru);
932                 h->free_huge_pages--;
933                 h->free_huge_pages_node[nid]--;
934                 update_and_free_page(h, page);
935         }
936         spin_unlock(&hugetlb_lock);
937 }
938
939 /*
940  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
941  * make specified memory blocks removable from the system.
942  * Note that start_pfn should aligned with (minimum) hugepage size.
943  */
944 void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
945 {
946         unsigned int order = 8 * sizeof(void *);
947         unsigned long pfn;
948         struct hstate *h;
949
950         /* Set scan step to minimum hugepage size */
951         for_each_hstate(h)
952                 if (order > huge_page_order(h))
953                         order = huge_page_order(h);
954         VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
955         for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
956                 dissolve_free_huge_page(pfn_to_page(pfn));
957 }
958
959 static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
960 {
961         struct page *page;
962         unsigned int r_nid;
963
964         if (h->order >= MAX_ORDER)
965                 return NULL;
966
967         /*
968          * Assume we will successfully allocate the surplus page to
969          * prevent racing processes from causing the surplus to exceed
970          * overcommit
971          *
972          * This however introduces a different race, where a process B
973          * tries to grow the static hugepage pool while alloc_pages() is
974          * called by process A. B will only examine the per-node
975          * counters in determining if surplus huge pages can be
976          * converted to normal huge pages in adjust_pool_surplus(). A
977          * won't be able to increment the per-node counter, until the
978          * lock is dropped by B, but B doesn't drop hugetlb_lock until
979          * no more huge pages can be converted from surplus to normal
980          * state (and doesn't try to convert again). Thus, we have a
981          * case where a surplus huge page exists, the pool is grown, and
982          * the surplus huge page still exists after, even though it
983          * should just have been converted to a normal huge page. This
984          * does not leak memory, though, as the hugepage will be freed
985          * once it is out of use. It also does not allow the counters to
986          * go out of whack in adjust_pool_surplus() as we don't modify
987          * the node values until we've gotten the hugepage and only the
988          * per-node value is checked there.
989          */
990         spin_lock(&hugetlb_lock);
991         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
992                 spin_unlock(&hugetlb_lock);
993                 return NULL;
994         } else {
995                 h->nr_huge_pages++;
996                 h->surplus_huge_pages++;
997         }
998         spin_unlock(&hugetlb_lock);
999
1000         if (nid == NUMA_NO_NODE)
1001                 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
1002                                    __GFP_REPEAT|__GFP_NOWARN,
1003                                    huge_page_order(h));
1004         else
1005                 page = alloc_pages_exact_node(nid,
1006                         htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1007                         __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
1008
1009         if (page && arch_prepare_hugepage(page)) {
1010                 __free_pages(page, huge_page_order(h));
1011                 page = NULL;
1012         }
1013
1014         spin_lock(&hugetlb_lock);
1015         if (page) {
1016                 INIT_LIST_HEAD(&page->lru);
1017                 r_nid = page_to_nid(page);
1018                 set_compound_page_dtor(page, free_huge_page);
1019                 set_hugetlb_cgroup(page, NULL);
1020                 /*
1021                  * We incremented the global counters already
1022                  */
1023                 h->nr_huge_pages_node[r_nid]++;
1024                 h->surplus_huge_pages_node[r_nid]++;
1025                 __count_vm_event(HTLB_BUDDY_PGALLOC);
1026         } else {
1027                 h->nr_huge_pages--;
1028                 h->surplus_huge_pages--;
1029                 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1030         }
1031         spin_unlock(&hugetlb_lock);
1032
1033         return page;
1034 }
1035
1036 /*
1037  * This allocation function is useful in the context where vma is irrelevant.
1038  * E.g. soft-offlining uses this function because it only cares physical
1039  * address of error page.
1040  */
1041 struct page *alloc_huge_page_node(struct hstate *h, int nid)
1042 {
1043         struct page *page = NULL;
1044
1045         spin_lock(&hugetlb_lock);
1046         if (h->free_huge_pages - h->resv_huge_pages > 0)
1047                 page = dequeue_huge_page_node(h, nid);
1048         spin_unlock(&hugetlb_lock);
1049
1050         if (!page)
1051                 page = alloc_buddy_huge_page(h, nid);
1052
1053         return page;
1054 }
1055
1056 /*
1057  * Increase the hugetlb pool such that it can accommodate a reservation
1058  * of size 'delta'.
1059  */
1060 static int gather_surplus_pages(struct hstate *h, int delta)
1061 {
1062         struct list_head surplus_list;
1063         struct page *page, *tmp;
1064         int ret, i;
1065         int needed, allocated;
1066         bool alloc_ok = true;
1067
1068         needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
1069         if (needed <= 0) {
1070                 h->resv_huge_pages += delta;
1071                 return 0;
1072         }
1073
1074         allocated = 0;
1075         INIT_LIST_HEAD(&surplus_list);
1076
1077         ret = -ENOMEM;
1078 retry:
1079         spin_unlock(&hugetlb_lock);
1080         for (i = 0; i < needed; i++) {
1081                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1082                 if (!page) {
1083                         alloc_ok = false;
1084                         break;
1085                 }
1086                 list_add(&page->lru, &surplus_list);
1087         }
1088         allocated += i;
1089
1090         /*
1091          * After retaking hugetlb_lock, we need to recalculate 'needed'
1092          * because either resv_huge_pages or free_huge_pages may have changed.
1093          */
1094         spin_lock(&hugetlb_lock);
1095         needed = (h->resv_huge_pages + delta) -
1096                         (h->free_huge_pages + allocated);
1097         if (needed > 0) {
1098                 if (alloc_ok)
1099                         goto retry;
1100                 /*
1101                  * We were not able to allocate enough pages to
1102                  * satisfy the entire reservation so we free what
1103                  * we've allocated so far.
1104                  */
1105                 goto free;
1106         }
1107         /*
1108          * The surplus_list now contains _at_least_ the number of extra pages
1109          * needed to accommodate the reservation.  Add the appropriate number
1110          * of pages to the hugetlb pool and free the extras back to the buddy
1111          * allocator.  Commit the entire reservation here to prevent another
1112          * process from stealing the pages as they are added to the pool but
1113          * before they are reserved.
1114          */
1115         needed += allocated;
1116         h->resv_huge_pages += delta;
1117         ret = 0;
1118
1119         /* Free the needed pages to the hugetlb pool */
1120         list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1121                 if ((--needed) < 0)
1122                         break;
1123                 /*
1124                  * This page is now managed by the hugetlb allocator and has
1125                  * no users -- drop the buddy allocator's reference.
1126                  */
1127                 put_page_testzero(page);
1128                 VM_BUG_ON_PAGE(page_count(page), page);
1129                 enqueue_huge_page(h, page);
1130         }
1131 free:
1132         spin_unlock(&hugetlb_lock);
1133
1134         /* Free unnecessary surplus pages to the buddy allocator */
1135         list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1136                 put_page(page);
1137         spin_lock(&hugetlb_lock);
1138
1139         return ret;
1140 }
1141
1142 /*
1143  * When releasing a hugetlb pool reservation, any surplus pages that were
1144  * allocated to satisfy the reservation must be explicitly freed if they were
1145  * never used.
1146  * Called with hugetlb_lock held.
1147  */
1148 static void return_unused_surplus_pages(struct hstate *h,
1149                                         unsigned long unused_resv_pages)
1150 {
1151         unsigned long nr_pages;
1152
1153         /* Uncommit the reservation */
1154         h->resv_huge_pages -= unused_resv_pages;
1155
1156         /* Cannot return gigantic pages currently */
1157         if (h->order >= MAX_ORDER)
1158                 return;
1159
1160         nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1161
1162         /*
1163          * We want to release as many surplus pages as possible, spread
1164          * evenly across all nodes with memory. Iterate across these nodes
1165          * until we can no longer free unreserved surplus pages. This occurs
1166          * when the nodes with surplus pages have no free pages.
1167          * free_pool_huge_page() will balance the the freed pages across the
1168          * on-line nodes with memory and will handle the hstate accounting.
1169          */
1170         while (nr_pages--) {
1171                 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1172                         break;
1173         }
1174 }
1175
1176 /*
1177  * Determine if the huge page at addr within the vma has an associated
1178  * reservation.  Where it does not we will need to logically increase
1179  * reservation and actually increase subpool usage before an allocation
1180  * can occur.  Where any new reservation would be required the
1181  * reservation change is prepared, but not committed.  Once the page
1182  * has been allocated from the subpool and instantiated the change should
1183  * be committed via vma_commit_reservation.  No action is required on
1184  * failure.
1185  */
1186 static long vma_needs_reservation(struct hstate *h,
1187                         struct vm_area_struct *vma, unsigned long addr)
1188 {
1189         struct resv_map *resv;
1190         pgoff_t idx;
1191         long chg;
1192
1193         resv = vma_resv_map(vma);
1194         if (!resv)
1195                 return 1;
1196
1197         idx = vma_hugecache_offset(h, vma, addr);
1198         chg = region_chg(resv, idx, idx + 1);
1199
1200         if (vma->vm_flags & VM_MAYSHARE)
1201                 return chg;
1202         else
1203                 return chg < 0 ? chg : 0;
1204 }
1205 static void vma_commit_reservation(struct hstate *h,
1206                         struct vm_area_struct *vma, unsigned long addr)
1207 {
1208         struct resv_map *resv;
1209         pgoff_t idx;
1210
1211         resv = vma_resv_map(vma);
1212         if (!resv)
1213                 return;
1214
1215         idx = vma_hugecache_offset(h, vma, addr);
1216         region_add(resv, idx, idx + 1);
1217 }
1218
1219 static struct page *alloc_huge_page(struct vm_area_struct *vma,
1220                                     unsigned long addr, int avoid_reserve)
1221 {
1222         struct hugepage_subpool *spool = subpool_vma(vma);
1223         struct hstate *h = hstate_vma(vma);
1224         struct page *page;
1225         long chg;
1226         int ret, idx;
1227         struct hugetlb_cgroup *h_cg;
1228
1229         idx = hstate_index(h);
1230         /*
1231          * Processes that did not create the mapping will have no
1232          * reserves and will not have accounted against subpool
1233          * limit. Check that the subpool limit can be made before
1234          * satisfying the allocation MAP_NORESERVE mappings may also
1235          * need pages and subpool limit allocated allocated if no reserve
1236          * mapping overlaps.
1237          */
1238         chg = vma_needs_reservation(h, vma, addr);
1239         if (chg < 0)
1240                 return ERR_PTR(-ENOMEM);
1241         if (chg || avoid_reserve)
1242                 if (hugepage_subpool_get_pages(spool, 1))
1243                         return ERR_PTR(-ENOSPC);
1244
1245         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1246         if (ret) {
1247                 if (chg || avoid_reserve)
1248                         hugepage_subpool_put_pages(spool, 1);
1249                 return ERR_PTR(-ENOSPC);
1250         }
1251         spin_lock(&hugetlb_lock);
1252         page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
1253         if (!page) {
1254                 spin_unlock(&hugetlb_lock);
1255                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1256                 if (!page) {
1257                         hugetlb_cgroup_uncharge_cgroup(idx,
1258                                                        pages_per_huge_page(h),
1259                                                        h_cg);
1260                         if (chg || avoid_reserve)
1261                                 hugepage_subpool_put_pages(spool, 1);
1262                         return ERR_PTR(-ENOSPC);
1263                 }
1264                 spin_lock(&hugetlb_lock);
1265                 list_move(&page->lru, &h->hugepage_activelist);
1266                 /* Fall through */
1267         }
1268         hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
1269         spin_unlock(&hugetlb_lock);
1270
1271         set_page_private(page, (unsigned long)spool);
1272
1273         vma_commit_reservation(h, vma, addr);
1274         return page;
1275 }
1276
1277 /*
1278  * alloc_huge_page()'s wrapper which simply returns the page if allocation
1279  * succeeds, otherwise NULL. This function is called from new_vma_page(),
1280  * where no ERR_VALUE is expected to be returned.
1281  */
1282 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
1283                                 unsigned long addr, int avoid_reserve)
1284 {
1285         struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
1286         if (IS_ERR(page))
1287                 page = NULL;
1288         return page;
1289 }
1290
1291 int __weak alloc_bootmem_huge_page(struct hstate *h)
1292 {
1293         struct huge_bootmem_page *m;
1294         int nr_nodes, node;
1295
1296         for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1297                 void *addr;
1298
1299                 addr = memblock_virt_alloc_try_nid_nopanic(
1300                                 huge_page_size(h), huge_page_size(h),
1301                                 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1302                 if (addr) {
1303                         /*
1304                          * Use the beginning of the huge page to store the
1305                          * huge_bootmem_page struct (until gather_bootmem
1306                          * puts them into the mem_map).
1307                          */
1308                         m = addr;
1309                         goto found;
1310                 }
1311         }
1312         return 0;
1313
1314 found:
1315         BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
1316         /* Put them into a private list first because mem_map is not up yet */
1317         list_add(&m->list, &huge_boot_pages);
1318         m->hstate = h;
1319         return 1;
1320 }
1321
1322 static void prep_compound_huge_page(struct page *page, int order)
1323 {
1324         if (unlikely(order > (MAX_ORDER - 1)))
1325                 prep_compound_gigantic_page(page, order);
1326         else
1327                 prep_compound_page(page, order);
1328 }
1329
1330 /* Put bootmem huge pages into the standard lists after mem_map is up */
1331 static void __init gather_bootmem_prealloc(void)
1332 {
1333         struct huge_bootmem_page *m;
1334
1335         list_for_each_entry(m, &huge_boot_pages, list) {
1336                 struct hstate *h = m->hstate;
1337                 struct page *page;
1338
1339 #ifdef CONFIG_HIGHMEM
1340                 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1341                 memblock_free_late(__pa(m),
1342                                    sizeof(struct huge_bootmem_page));
1343 #else
1344                 page = virt_to_page(m);
1345 #endif
1346                 WARN_ON(page_count(page) != 1);
1347                 prep_compound_huge_page(page, h->order);
1348                 WARN_ON(PageReserved(page));
1349                 prep_new_huge_page(h, page, page_to_nid(page));
1350                 /*
1351                  * If we had gigantic hugepages allocated at boot time, we need
1352                  * to restore the 'stolen' pages to totalram_pages in order to
1353                  * fix confusing memory reports from free(1) and another
1354                  * side-effects, like CommitLimit going negative.
1355                  */
1356                 if (h->order > (MAX_ORDER - 1))
1357                         adjust_managed_page_count(page, 1 << h->order);
1358         }
1359 }
1360
1361 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1362 {
1363         unsigned long i;
1364
1365         for (i = 0; i < h->max_huge_pages; ++i) {
1366                 if (h->order >= MAX_ORDER) {
1367                         if (!alloc_bootmem_huge_page(h))
1368                                 break;
1369                 } else if (!alloc_fresh_huge_page(h,
1370                                          &node_states[N_MEMORY]))
1371                         break;
1372         }
1373         h->max_huge_pages = i;
1374 }
1375
1376 static void __init hugetlb_init_hstates(void)
1377 {
1378         struct hstate *h;
1379
1380         for_each_hstate(h) {
1381                 /* oversize hugepages were init'ed in early boot */
1382                 if (h->order < MAX_ORDER)
1383                         hugetlb_hstate_alloc_pages(h);
1384         }
1385 }
1386
1387 static char * __init memfmt(char *buf, unsigned long n)
1388 {
1389         if (n >= (1UL << 30))
1390                 sprintf(buf, "%lu GB", n >> 30);
1391         else if (n >= (1UL << 20))
1392                 sprintf(buf, "%lu MB", n >> 20);
1393         else
1394                 sprintf(buf, "%lu KB", n >> 10);
1395         return buf;
1396 }
1397
1398 static void __init report_hugepages(void)
1399 {
1400         struct hstate *h;
1401
1402         for_each_hstate(h) {
1403                 char buf[32];
1404                 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
1405                         memfmt(buf, huge_page_size(h)),
1406                         h->free_huge_pages);
1407         }
1408 }
1409
1410 #ifdef CONFIG_HIGHMEM
1411 static void try_to_free_low(struct hstate *h, unsigned long count,
1412                                                 nodemask_t *nodes_allowed)
1413 {
1414         int i;
1415
1416         if (h->order >= MAX_ORDER)
1417                 return;
1418
1419         for_each_node_mask(i, *nodes_allowed) {
1420                 struct page *page, *next;
1421                 struct list_head *freel = &h->hugepage_freelists[i];
1422                 list_for_each_entry_safe(page, next, freel, lru) {
1423                         if (count >= h->nr_huge_pages)
1424                                 return;
1425                         if (PageHighMem(page))
1426                                 continue;
1427                         list_del(&page->lru);
1428                         update_and_free_page(h, page);
1429                         h->free_huge_pages--;
1430                         h->free_huge_pages_node[page_to_nid(page)]--;
1431                 }
1432         }
1433 }
1434 #else
1435 static inline void try_to_free_low(struct hstate *h, unsigned long count,
1436                                                 nodemask_t *nodes_allowed)
1437 {
1438 }
1439 #endif
1440
1441 /*
1442  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
1443  * balanced by operating on them in a round-robin fashion.
1444  * Returns 1 if an adjustment was made.
1445  */
1446 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1447                                 int delta)
1448 {
1449         int nr_nodes, node;
1450
1451         VM_BUG_ON(delta != -1 && delta != 1);
1452
1453         if (delta < 0) {
1454                 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1455                         if (h->surplus_huge_pages_node[node])
1456                                 goto found;
1457                 }
1458         } else {
1459                 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1460                         if (h->surplus_huge_pages_node[node] <
1461                                         h->nr_huge_pages_node[node])
1462                                 goto found;
1463                 }
1464         }
1465         return 0;
1466
1467 found:
1468         h->surplus_huge_pages += delta;
1469         h->surplus_huge_pages_node[node] += delta;
1470         return 1;
1471 }
1472
/* Huge pages of @h that are not surplus; the argument is parenthesized. */
#define persistent_huge_pages(h) ((h)->nr_huge_pages - (h)->surplus_huge_pages)
1474 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1475                                                 nodemask_t *nodes_allowed)
1476 {
1477         unsigned long min_count, ret;
1478
1479         if (h->order >= MAX_ORDER)
1480                 return h->max_huge_pages;
1481
1482         /*
1483          * Increase the pool size
1484          * First take pages out of surplus state.  Then make up the
1485          * remaining difference by allocating fresh huge pages.
1486          *
1487          * We might race with alloc_buddy_huge_page() here and be unable
1488          * to convert a surplus huge page to a normal huge page. That is
1489          * not critical, though, it just means the overall size of the
1490          * pool might be one hugepage larger than it needs to be, but
1491          * within all the constraints specified by the sysctls.
1492          */
1493         spin_lock(&hugetlb_lock);
1494         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1495                 if (!adjust_pool_surplus(h, nodes_allowed, -1))
1496                         break;
1497         }
1498
1499         while (count > persistent_huge_pages(h)) {
1500                 /*
1501                  * If this allocation races such that we no longer need the
1502                  * page, free_huge_page will handle it by freeing the page
1503                  * and reducing the surplus.
1504                  */
1505                 spin_unlock(&hugetlb_lock);
1506                 ret = alloc_fresh_huge_page(h, nodes_allowed);
1507                 spin_lock(&hugetlb_lock);
1508                 if (!ret)
1509                         goto out;
1510
1511                 /* Bail for signals. Probably ctrl-c from user */
1512                 if (signal_pending(current))
1513                         goto out;
1514         }
1515
1516         /*
1517          * Decrease the pool size
1518          * First return free pages to the buddy allocator (being careful
1519          * to keep enough around to satisfy reservations).  Then place
1520          * pages into surplus state as needed so the pool will shrink
1521          * to the desired size as pages become free.
1522          *
1523          * By placing pages into the surplus state independent of the
1524          * overcommit value, we are allowing the surplus pool size to
1525          * exceed overcommit. There are few sane options here. Since
1526          * alloc_buddy_huge_page() is checking the global counter,
1527          * though, we'll note that we're not allowed to exceed surplus
1528          * and won't grow the pool anywhere else. Not until one of the
1529          * sysctls are changed, or the surplus pages go out of use.
1530          */
1531         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1532         min_count = max(count, min_count);
1533         try_to_free_low(h, min_count, nodes_allowed);
1534         while (min_count < persistent_huge_pages(h)) {
1535                 if (!free_pool_huge_page(h, nodes_allowed, 0))
1536                         break;
1537         }
1538         while (count < persistent_huge_pages(h)) {
1539                 if (!adjust_pool_surplus(h, nodes_allowed, 1))
1540                         break;
1541         }
1542 out:
1543         ret = persistent_huge_pages(h);
1544         spin_unlock(&hugetlb_lock);
1545         return ret;
1546 }
1547
/* Define <_name>_attr as a read-only kobj_attribute via __ATTR_RO(). */
#define HSTATE_ATTR_RO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * Define <_name>_attr as a kobj_attribute with mode 0644, served by
 * <_name>_show() and <_name>_store().
 */
#define HSTATE_ATTR(_name)						\
	static struct kobj_attribute _name##_attr =			\
		__ATTR(_name, 0644, _name##_show, _name##_store)
1554
1555 static struct kobject *hugepages_kobj;
1556 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1557
1558 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1559
1560 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1561 {
1562         int i;
1563
1564         for (i = 0; i < HUGE_MAX_HSTATE; i++)
1565                 if (hstate_kobjs[i] == kobj) {
1566                         if (nidp)
1567                                 *nidp = NUMA_NO_NODE;
1568                         return &hstates[i];
1569                 }
1570
1571         return kobj_to_node_hstate(kobj, nidp);
1572 }
1573
1574 static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1575                                         struct kobj_attribute *attr, char *buf)
1576 {
1577         struct hstate *h;
1578         unsigned long nr_huge_pages;
1579         int nid;
1580
1581         h = kobj_to_hstate(kobj, &nid);
1582         if (nid == NUMA_NO_NODE)
1583                 nr_huge_pages = h->nr_huge_pages;
1584         else
1585                 nr_huge_pages = h->nr_huge_pages_node[nid];
1586
1587         return sprintf(buf, "%lu\n", nr_huge_pages);
1588 }
1589
1590 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1591                         struct kobject *kobj, struct kobj_attribute *attr,
1592                         const char *buf, size_t len)
1593 {
1594         int err;
1595         int nid;
1596         unsigned long count;
1597         struct hstate *h;
1598         NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1599
1600         err = kstrtoul(buf, 10, &count);
1601         if (err)
1602                 goto out;
1603
1604         h = kobj_to_hstate(kobj, &nid);
1605         if (h->order >= MAX_ORDER) {
1606                 err = -EINVAL;
1607                 goto out;
1608         }
1609
1610         if (nid == NUMA_NO_NODE) {
1611                 /*
1612                  * global hstate attribute
1613                  */
1614                 if (!(obey_mempolicy &&
1615                                 init_nodemask_of_mempolicy(nodes_allowed))) {
1616                         NODEMASK_FREE(nodes_allowed);
1617                         nodes_allowed = &node_states[N_MEMORY];
1618                 }
1619         } else if (nodes_allowed) {
1620                 /*
1621                  * per node hstate attribute: adjust count to global,
1622                  * but restrict alloc/free to the specified node.
1623                  */
1624                 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1625                 init_nodemask_of_node(nodes_allowed, nid);
1626         } else
1627                 nodes_allowed = &node_states[N_MEMORY];
1628
1629         h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1630
1631         if (nodes_allowed != &node_states[N_MEMORY])
1632                 NODEMASK_FREE(nodes_allowed);
1633
1634         return len;
1635 out:
1636         NODEMASK_FREE(nodes_allowed);
1637         return err;
1638 }
1639
1640 static ssize_t nr_hugepages_show(struct kobject *kobj,
1641                                        struct kobj_attribute *attr, char *buf)
1642 {
1643         return nr_hugepages_show_common(kobj, attr, buf);
1644 }
1645
1646 static ssize_t nr_hugepages_store(struct kobject *kobj,
1647                struct kobj_attribute *attr, const char *buf, size_t len)
1648 {
1649         return nr_hugepages_store_common(false, kobj, attr, buf, len);
1650 }
1651 HSTATE_ATTR(nr_hugepages);
1652
1653 #ifdef CONFIG_NUMA
1654
1655 /*
1656  * hstate attribute for optionally mempolicy-based constraint on persistent
1657  * huge page alloc/free.
1658  */
1659 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1660                                        struct kobj_attribute *attr, char *buf)
1661 {
1662         return nr_hugepages_show_common(kobj, attr, buf);
1663 }
1664
1665 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1666                struct kobj_attribute *attr, const char *buf, size_t len)
1667 {
1668         return nr_hugepages_store_common(true, kobj, attr, buf, len);
1669 }
1670 HSTATE_ATTR(nr_hugepages_mempolicy);
1671 #endif
1672
1673
1674 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1675                                         struct kobj_attribute *attr, char *buf)
1676 {
1677         struct hstate *h = kobj_to_hstate(kobj, NULL);
1678         return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1679 }
1680
1681 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1682                 struct kobj_attribute *attr, const char *buf, size_t count)
1683 {
1684         int err;
1685         unsigned long input;
1686         struct hstate *h = kobj_to_hstate(kobj, NULL);
1687
1688         if (h->order >= MAX_ORDER)
1689                 return -EINVAL;
1690
1691         err = kstrtoul(buf, 10, &input);
1692         if (err)
1693                 return err;
1694
1695         spin_lock(&hugetlb_lock);
1696         h->nr_overcommit_huge_pages = input;
1697         spin_unlock(&hugetlb_lock);
1698
1699         return count;
1700 }
1701 HSTATE_ATTR(nr_overcommit_hugepages);
1702
1703 static ssize_t free_hugepages_show(struct kobject *kobj,
1704                                         struct kobj_attribute *attr, char *buf)
1705 {
1706         struct hstate *h;
1707         unsigned long free_huge_pages;
1708         int nid;
1709
1710         h = kobj_to_hstate(kobj, &nid);
1711         if (nid == NUMA_NO_NODE)
1712                 free_huge_pages = h->free_huge_pages;
1713         else
1714                 free_huge_pages = h->free_huge_pages_node[nid];
1715
1716         return sprintf(buf, "%lu\n", free_huge_pages);
1717 }
1718 HSTATE_ATTR_RO(free_hugepages);
1719
1720 static ssize_t resv_hugepages_show(struct kobject *kobj,
1721                                         struct kobj_attribute *attr, char *buf)
1722 {
1723         struct hstate *h = kobj_to_hstate(kobj, NULL);
1724         return sprintf(buf, "%lu\n", h->resv_huge_pages);
1725 }
1726 HSTATE_ATTR_RO(resv_hugepages);
1727
1728 static ssize_t surplus_hugepages_show(struct kobject *kobj,
1729                                         struct kobj_attribute *attr, char *buf)
1730 {
1731         struct hstate *h;
1732         unsigned long surplus_huge_pages;
1733         int nid;
1734
1735         h = kobj_to_hstate(kobj, &nid);
1736         if (nid == NUMA_NO_NODE)
1737                 surplus_huge_pages = h->surplus_huge_pages;
1738         else
1739                 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1740
1741         return sprintf(buf, "%lu\n", surplus_huge_pages);
1742 }
1743 HSTATE_ATTR_RO(surplus_hugepages);
1744
1745 static struct attribute *hstate_attrs[] = {
1746         &nr_hugepages_attr.attr,
1747         &nr_overcommit_hugepages_attr.attr,
1748         &free_hugepages_attr.attr,
1749         &resv_hugepages_attr.attr,
1750         &surplus_hugepages_attr.attr,
1751 #ifdef CONFIG_NUMA
1752         &nr_hugepages_mempolicy_attr.attr,
1753 #endif
1754         NULL,
1755 };
1756
1757 static struct attribute_group hstate_attr_group = {
1758         .attrs = hstate_attrs,
1759 };
1760
1761 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1762                                     struct kobject **hstate_kobjs,
1763                                     struct attribute_group *hstate_attr_group)
1764 {
1765         int retval;
1766         int hi = hstate_index(h);
1767
1768         hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1769         if (!hstate_kobjs[hi])
1770                 return -ENOMEM;
1771
1772         retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1773         if (retval)
1774                 kobject_put(hstate_kobjs[hi]);
1775
1776         return retval;
1777 }
1778
1779 static void __init hugetlb_sysfs_init(void)
1780 {
1781         struct hstate *h;
1782         int err;
1783
1784         hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1785         if (!hugepages_kobj)
1786                 return;
1787
1788         for_each_hstate(h) {
1789                 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1790                                          hstate_kobjs, &hstate_attr_group);
1791                 if (err)
1792                         pr_err("Hugetlb: Unable to add hstate %s", h->name);
1793         }
1794 }
1795
1796 #ifdef CONFIG_NUMA
1797
1798 /*
1799  * node_hstate/s - associate per node hstate attributes, via their kobjects,
1800  * with node devices in node_devices[] using a parallel array.  The array
1801  * index of a node device or _hstate == node id.
1802  * This is here to avoid any static dependency of the node device driver, in
1803  * the base kernel, on the hugetlb module.
1804  */
1805 struct node_hstate {
1806         struct kobject          *hugepages_kobj;
1807         struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
1808 };
1809 struct node_hstate node_hstates[MAX_NUMNODES];
1810
1811 /*
1812  * A subset of global hstate attributes for node devices
1813  */
1814 static struct attribute *per_node_hstate_attrs[] = {
1815         &nr_hugepages_attr.attr,
1816         &free_hugepages_attr.attr,
1817         &surplus_hugepages_attr.attr,
1818         NULL,
1819 };
1820
1821 static struct attribute_group per_node_hstate_attr_group = {
1822         .attrs = per_node_hstate_attrs,
1823 };
1824
1825 /*
1826  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
1827  * Returns node id via non-NULL nidp.
1828  */
1829 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1830 {
1831         int nid;
1832
1833         for (nid = 0; nid < nr_node_ids; nid++) {
1834                 struct node_hstate *nhs = &node_hstates[nid];
1835                 int i;
1836                 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1837                         if (nhs->hstate_kobjs[i] == kobj) {
1838                                 if (nidp)
1839                                         *nidp = nid;
1840                                 return &hstates[i];
1841                         }
1842         }
1843
1844         BUG();
1845         return NULL;
1846 }
1847
1848 /*
1849  * Unregister hstate attributes from a single node device.
1850  * No-op if no hstate attributes attached.
1851  */
1852 static void hugetlb_unregister_node(struct node *node)
1853 {
1854         struct hstate *h;
1855         struct node_hstate *nhs = &node_hstates[node->dev.id];
1856
1857         if (!nhs->hugepages_kobj)
1858                 return;         /* no hstate attributes */
1859
1860         for_each_hstate(h) {
1861                 int idx = hstate_index(h);
1862                 if (nhs->hstate_kobjs[idx]) {
1863                         kobject_put(nhs->hstate_kobjs[idx]);
1864                         nhs->hstate_kobjs[idx] = NULL;
1865                 }
1866         }
1867
1868         kobject_put(nhs->hugepages_kobj);
1869         nhs->hugepages_kobj = NULL;
1870 }
1871
1872 /*
1873  * hugetlb module exit:  unregister hstate attributes from node devices
1874  * that have them.
1875  */
1876 static void hugetlb_unregister_all_nodes(void)
1877 {
1878         int nid;
1879
1880         /*
1881          * disable node device registrations.
1882          */
1883         register_hugetlbfs_with_node(NULL, NULL);
1884
1885         /*
1886          * remove hstate attributes from any nodes that have them.
1887          */
1888         for (nid = 0; nid < nr_node_ids; nid++)
1889                 hugetlb_unregister_node(node_devices[nid]);
1890 }
1891
1892 /*
1893  * Register hstate attributes for a single node device.
1894  * No-op if attributes already registered.
1895  */
1896 static void hugetlb_register_node(struct node *node)
1897 {
1898         struct hstate *h;
1899         struct node_hstate *nhs = &node_hstates[node->dev.id];
1900         int err;
1901
1902         if (nhs->hugepages_kobj)
1903                 return;         /* already allocated */
1904
1905         nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1906                                                         &node->dev.kobj);
1907         if (!nhs->hugepages_kobj)
1908                 return;
1909
1910         for_each_hstate(h) {
1911                 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1912                                                 nhs->hstate_kobjs,
1913                                                 &per_node_hstate_attr_group);
1914                 if (err) {
1915                         pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
1916                                 h->name, node->dev.id);
1917                         hugetlb_unregister_node(node);
1918                         break;
1919                 }
1920         }
1921 }
1922
1923 /*
1924  * hugetlb init time:  register hstate attributes for all registered node
1925  * devices of nodes that have memory.  All on-line nodes should have
1926  * registered their associated device by this time.
1927  */
1928 static void hugetlb_register_all_nodes(void)
1929 {
1930         int nid;
1931
1932         for_each_node_state(nid, N_MEMORY) {
1933                 struct node *node = node_devices[nid];
1934                 if (node->dev.id == nid)
1935                         hugetlb_register_node(node);
1936         }
1937
1938         /*
1939          * Let the node device driver know we're here so it can
1940          * [un]register hstate attributes on node hotplug.
1941          */
1942         register_hugetlbfs_with_node(hugetlb_register_node,
1943                                      hugetlb_unregister_node);
1944 }
1945 #else   /* !CONFIG_NUMA */
1946
1947 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1948 {
1949         BUG();
1950         if (nidp)
1951                 *nidp = -1;
1952         return NULL;
1953 }
1954
/* !CONFIG_NUMA: no per-node hstate attributes exist, nothing to remove. */
static void hugetlb_unregister_all_nodes(void)
{
}
1956
/* !CONFIG_NUMA: no per-node hstate attributes exist, nothing to add. */
static void hugetlb_register_all_nodes(void)
{
}
1958
1959 #endif
1960
1961 static void __exit hugetlb_exit(void)
1962 {
1963         struct hstate *h;
1964
1965         hugetlb_unregister_all_nodes();
1966
1967         for_each_hstate(h) {
1968                 kobject_put(hstate_kobjs[hstate_index(h)]);
1969         }
1970
1971         kobject_put(hugepages_kobj);
1972         kfree(htlb_fault_mutex_table);
1973 }
1974 module_exit(hugetlb_exit);
1975
1976 static int __init hugetlb_init(void)
1977 {
1978         int i;
1979
1980         /* Some platform decide whether they support huge pages at boot
1981          * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
1982          * there is no such support
1983          */
1984         if (HPAGE_SHIFT == 0)
1985                 return 0;
1986
1987         if (!size_to_hstate(default_hstate_size)) {
1988                 default_hstate_size = HPAGE_SIZE;
1989                 if (!size_to_hstate(default_hstate_size))
1990                         hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1991         }
1992         default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1993         if (default_hstate_max_huge_pages)
1994                 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1995
1996         hugetlb_init_hstates();
1997         gather_bootmem_prealloc();
1998         report_hugepages();
1999
2000         hugetlb_sysfs_init();
2001         hugetlb_register_all_nodes();
2002         hugetlb_cgroup_file_init();
2003
2004 #ifdef CONFIG_SMP
2005         num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
2006 #else
2007         num_fault_mutexes = 1;
2008 #endif
2009         htlb_fault_mutex_table =
2010                 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
2011         BUG_ON(!htlb_fault_mutex_table);
2012
2013         for (i = 0; i < num_fault_mutexes; i++)
2014                 mutex_init(&htlb_fault_mutex_table[i]);
2015         return 0;
2016 }
2017 module_init(hugetlb_init);
2018
2019 /* Should be called on processing a hugepagesz=... option */
2020 void __init hugetlb_add_hstate(unsigned order)
2021 {
2022         struct hstate *h;
2023         unsigned long i;
2024
2025         if (size_to_hstate(PAGE_SIZE << order)) {
2026                 pr_warning("hugepagesz= specified twice, ignoring\n");
2027                 return;
2028         }
2029         BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
2030         BUG_ON(order == 0);
2031         h = &hstates[hugetlb_max_hstate++];
2032         h->order = order;
2033         h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
2034         h->nr_huge_pages = 0;
2035         h->free_huge_pages = 0;
2036         for (i = 0; i < MAX_NUMNODES; ++i)
2037                 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
2038         INIT_LIST_HEAD(&h->hugepage_activelist);
2039         h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
2040         h->next_nid_to_free = first_node(node_states[N_MEMORY]);
2041         snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
2042                                         huge_page_size(h)/1024);
2043
2044         parsed_hstate = h;
2045 }
2046
2047 static int __init hugetlb_nrpages_setup(char *s)
2048 {
2049         unsigned long *mhp;
2050         static unsigned long *last_mhp;
2051
2052         /*
2053          * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2054          * so this hugepages= parameter goes to the "default hstate".
2055          */
2056         if (!hugetlb_max_hstate)
2057                 mhp = &default_hstate_max_huge_pages;
2058         else
2059                 mhp = &parsed_hstate->max_huge_pages;
2060
2061         if (mhp == last_mhp) {
2062                 pr_warning("hugepages= specified twice without "
2063                            "interleaving hugepagesz=, ignoring\n");
2064                 return 1;
2065         }
2066
2067         if (sscanf(s, "%lu", mhp) <= 0)
2068                 *mhp = 0;
2069
2070         /*
2071          * Global state is always initialized later in hugetlb_init.
2072          * But we need to allocate >= MAX_ORDER hstates here early to still
2073          * use the bootmem allocator.
2074          */
2075         if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
2076                 hugetlb_hstate_alloc_pages(parsed_hstate);
2077
2078         last_mhp = mhp;
2079
2080         return 1;
2081 }
2082 __setup("hugepages=", hugetlb_nrpages_setup);
2083
2084 static int __init hugetlb_default_setup(char *s)
2085 {
2086         default_hstate_size = memparse(s, &s);
2087         return 1;
2088 }
2089 __setup("default_hugepagesz=", hugetlb_default_setup);
2090
2091 static unsigned int cpuset_mems_nr(unsigned int *array)
2092 {
2093         int node;
2094         unsigned int nr = 0;
2095
2096         for_each_node_mask(node, cpuset_current_mems_allowed)
2097                 nr += array[node];
2098
2099         return nr;
2100 }
2101
2102 #ifdef CONFIG_SYSCTL
2103 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2104                          struct ctl_table *table, int write,
2105                          void __user *buffer, size_t *length, loff_t *ppos)
2106 {
2107         struct hstate *h = &default_hstate;
2108         unsigned long tmp;
2109         int ret;
2110
2111         tmp = h->max_huge_pages;
2112
2113         if (write && h->order >= MAX_ORDER)
2114                 return -EINVAL;
2115
2116         table->data = &tmp;
2117         table->maxlen = sizeof(unsigned long);
2118         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2119         if (ret)
2120                 goto out;
2121
2122         if (write) {
2123                 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
2124                                                 GFP_KERNEL | __GFP_NORETRY);
2125                 if (!(obey_mempolicy &&
2126                                init_nodemask_of_mempolicy(nodes_allowed))) {
2127                         NODEMASK_FREE(nodes_allowed);
2128                         nodes_allowed = &node_states[N_MEMORY];
2129                 }
2130                 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2131
2132                 if (nodes_allowed != &node_states[N_MEMORY])
2133                         NODEMASK_FREE(nodes_allowed);
2134         }
2135 out:
2136         return ret;
2137 }
2138
2139 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
2140                           void __user *buffer, size_t *length, loff_t *ppos)
2141 {
2142
2143         return hugetlb_sysctl_handler_common(false, table, write,
2144                                                         buffer, length, ppos);
2145 }
2146
2147 #ifdef CONFIG_NUMA
2148 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
2149                           void __user *buffer, size_t *length, loff_t *ppos)
2150 {
2151         return hugetlb_sysctl_handler_common(true, table, write,
2152                                                         buffer, length, ppos);
2153 }
2154 #endif /* CONFIG_NUMA */
2155
2156 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2157                         void __user *buffer,
2158                         size_t *length, loff_t *ppos)
2159 {
2160         struct hstate *h = &default_hstate;
2161         unsigned long tmp;
2162         int ret;
2163
2164         tmp = h->nr_overcommit_huge_pages;
2165
2166         if (write && h->order >= MAX_ORDER)
2167                 return -EINVAL;
2168
2169         table->data = &tmp;
2170         table->maxlen = sizeof(unsigned long);
2171         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2172         if (ret)
2173                 goto out;
2174
2175         if (write) {
2176                 spin_lock(&hugetlb_lock);
2177                 h->nr_overcommit_huge_pages = tmp;
2178                 spin_unlock(&hugetlb_lock);
2179         }
2180 out:
2181         return ret;
2182 }
2183
2184 #endif /* CONFIG_SYSCTL */
2185
2186 void hugetlb_report_meminfo(struct seq_file *m)
2187 {
2188         struct hstate *h = &default_hstate;
2189         seq_printf(m,
2190                         "HugePages_Total:   %5lu\n"
2191                         "HugePages_Free:    %5lu\n"
2192                         "HugePages_Rsvd:    %5lu\n"
2193                         "HugePages_Surp:    %5lu\n"
2194                         "Hugepagesize:   %8lu kB\n",
2195                         h->nr_huge_pages,
2196                         h->free_huge_pages,
2197                         h->resv_huge_pages,
2198                         h->surplus_huge_pages,
2199                         1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2200 }
2201
2202 int hugetlb_report_node_meminfo(int nid, char *buf)
2203 {
2204         struct hstate *h = &default_hstate;
2205         return sprintf(buf,
2206                 "Node %d HugePages_Total: %5u\n"
2207                 "Node %d HugePages_Free:  %5u\n"
2208                 "Node %d HugePages_Surp:  %5u\n",
2209                 nid, h->nr_huge_pages_node[nid],
2210                 nid, h->free_huge_pages_node[nid],
2211                 nid, h->surplus_huge_pages_node[nid]);
2212 }
2213
2214 void hugetlb_show_meminfo(void)
2215 {
2216         struct hstate *h;
2217         int nid;
2218
2219         for_each_node_state(nid, N_MEMORY)
2220                 for_each_hstate(h)
2221                         pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2222                                 nid,
2223                                 h->nr_huge_pages_node[nid],
2224                                 h->free_huge_pages_node[nid],
2225                                 h->surplus_huge_pages_node[nid],
2226                                 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2227 }
2228
2229 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
2230 unsigned long hugetlb_total_pages(void)
2231 {
2232         struct hstate *h;
2233         unsigned long nr_total_pages = 0;
2234
2235         for_each_hstate(h)
2236                 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2237         return nr_total_pages;
2238 }
2239
2240 static int hugetlb_acct_memory(struct hstate *h, long delta)
2241 {
2242         int ret = -ENOMEM;
2243
2244         spin_lock(&hugetlb_lock);
2245         /*
2246          * When cpuset is configured, it breaks the strict hugetlb page
2247          * reservation as the accounting is done on a global variable. Such
2248          * reservation is completely rubbish in the presence of cpuset because
2249          * the reservation is not checked against page availability for the
2250          * current cpuset. Application can still potentially OOM'ed by kernel
2251          * with lack of free htlb page in cpuset that the task is in.
2252          * Attempt to enforce strict accounting with cpuset is almost
2253          * impossible (or too ugly) because cpuset is too fluid that
2254          * task or memory node can be dynamically moved between cpusets.
2255          *
2256          * The change of semantics for shared hugetlb mapping with cpuset is
2257          * undesirable. However, in order to preserve some of the semantics,
2258          * we fall back to check against current free page availability as
2259          * a best attempt and hopefully to minimize the impact of changing
2260          * semantics that cpuset has.
2261          */
2262         if (delta > 0) {
2263                 if (gather_surplus_pages(h, delta) < 0)
2264                         goto out;
2265
2266                 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2267                         return_unused_surplus_pages(h, delta);
2268                         goto out;
2269                 }
2270         }
2271
2272         ret = 0;
2273         if (delta < 0)
2274                 return_unused_surplus_pages(h, (unsigned long) -delta);
2275
2276 out:
2277         spin_unlock(&hugetlb_lock);
2278         return ret;
2279 }
2280
2281 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2282 {
2283         struct resv_map *resv = vma_resv_map(vma);
2284
2285         /*
2286          * This new VMA should share its siblings reservation map if present.
2287          * The VMA will only ever have a valid reservation map pointer where
2288          * it is being copied for another still existing VMA.  As that VMA
2289          * has a reference to the reservation map it cannot disappear until
2290          * after this open call completes.  It is therefore safe to take a
2291          * new reference here without additional locking.
2292          */
2293         if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2294                 kref_get(&resv->refs);
2295 }
2296
2297 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2298 {
2299         struct hstate *h = hstate_vma(vma);
2300         struct resv_map *resv = vma_resv_map(vma);
2301         struct hugepage_subpool *spool = subpool_vma(vma);
2302         unsigned long reserve, start, end;
2303
2304         if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2305                 return;
2306
2307         start = vma_hugecache_offset(h, vma, vma->vm_start);
2308         end = vma_hugecache_offset(h, vma, vma->vm_end);
2309
2310         reserve = (end - start) - region_count(resv, start, end);
2311
2312         kref_put(&resv->refs, resv_map_release);
2313
2314         if (reserve) {
2315                 hugetlb_acct_memory(h, -reserve);
2316                 hugepage_subpool_put_pages(spool, reserve);
2317         }
2318 }
2319
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();

	return 0;
}
2331
2332 const struct vm_operations_struct hugetlb_vm_ops = {
2333         .fault = hugetlb_vm_op_fault,
2334         .open = hugetlb_vm_op_open,
2335         .close = hugetlb_vm_op_close,
2336 };
2337
2338 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2339                                 int writable)
2340 {
2341         pte_t entry;
2342
2343         if (writable) {
2344                 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
2345                                          vma->vm_page_prot)));
2346         } else {
2347                 entry = huge_pte_wrprotect(mk_huge_pte(page,
2348                                            vma->vm_page_prot));
2349         }
2350         entry = pte_mkyoung(entry);
2351         entry = pte_mkhuge(entry);
2352         entry = arch_make_huge_pte(entry, vma, page, writable);
2353
2354         return entry;
2355 }
2356
2357 static void set_huge_ptep_writable(struct vm_area_struct *vma,
2358                                    unsigned long address, pte_t *ptep)
2359 {
2360         pte_t entry;
2361
2362         entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
2363         if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2364                 update_mmu_cache(vma, address, ptep);
2365 }
2366
2367
2368 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2369                             struct vm_area_struct *vma)
2370 {
2371         pte_t *src_pte, *dst_pte, entry;
2372         struct page *ptepage;
2373         unsigned long addr;
2374         int cow;
2375         struct hstate *h = hstate_vma(vma);
2376         unsigned long sz = huge_page_size(h);
2377         unsigned long mmun_start;       /* For mmu_notifiers */
2378         unsigned long mmun_end;         /* For mmu_notifiers */
2379         int ret = 0;
2380
2381         cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2382
2383         mmun_start = vma->vm_start;
2384         mmun_end = vma->vm_end;
2385         if (cow)
2386                 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2387
2388         for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2389                 spinlock_t *src_ptl, *dst_ptl;
2390                 src_pte = huge_pte_offset(src, addr);
2391                 if (!src_pte)
2392                         continue;
2393                 dst_pte = huge_pte_alloc(dst, addr, sz);
2394                 if (!dst_pte) {
2395                         ret = -ENOMEM;
2396                         break;
2397                 }
2398
2399                 /* If the pagetables are shared don't copy or take references */
2400                 if (dst_pte == src_pte)
2401                         continue;
2402
2403                 dst_ptl = huge_pte_lock(h, dst, dst_pte);
2404                 src_ptl = huge_pte_lockptr(h, src, src_pte);
2405                 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
2406                 if (!huge_pte_none(huge_ptep_get(src_pte))) {
2407                         if (cow)
2408                                 huge_ptep_set_wrprotect(src, addr, src_pte);
2409                         entry = huge_ptep_get(src_pte);
2410                         ptepage = pte_page(entry);
2411                         get_page(ptepage);
2412                         page_dup_rmap(ptepage);
2413                         set_huge_pte_at(dst, addr, dst_pte, entry);
2414                 }
2415                 spin_unlock(src_ptl);
2416                 spin_unlock(dst_ptl);
2417         }
2418
2419         if (cow)
2420                 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2421
2422         return ret;
2423 }
2424
2425 static int is_hugetlb_entry_migration(pte_t pte)
2426 {
2427         swp_entry_t swp;
2428
2429         if (huge_pte_none(pte) || pte_present(pte))
2430                 return 0;
2431         swp = pte_to_swp_entry(pte);
2432         if (non_swap_entry(swp) && is_migration_entry(swp))
2433                 return 1;
2434         else
2435                 return 0;
2436 }
2437
2438 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2439 {
2440         swp_entry_t swp;
2441
2442         if (huge_pte_none(pte) || pte_present(pte))
2443                 return 0;
2444         swp = pte_to_swp_entry(pte);
2445         if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2446                 return 1;
2447         else
2448                 return 0;
2449 }
2450
2451 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2452                             unsigned long start, unsigned long end,
2453                             struct page *ref_page)
2454 {
2455         int force_flush = 0;
2456         struct mm_struct *mm = vma->vm_mm;
2457         unsigned long address;
2458         pte_t *ptep;
2459         pte_t pte;
2460         spinlock_t *ptl;
2461         struct page *page;
2462         struct hstate *h = hstate_vma(vma);
2463         unsigned long sz = huge_page_size(h);
2464         const unsigned long mmun_start = start; /* For mmu_notifiers */
2465         const unsigned long mmun_end   = end;   /* For mmu_notifiers */
2466
2467         WARN_ON(!is_vm_hugetlb_page(vma));
2468         BUG_ON(start & ~huge_page_mask(h));
2469         BUG_ON(end & ~huge_page_mask(h));
2470
2471         tlb_start_vma(tlb, vma);
2472         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2473 again:
2474         for (address = start; address < end; address += sz) {
2475                 ptep = huge_pte_offset(mm, address);
2476                 if (!ptep)
2477                         continue;
2478
2479                 ptl = huge_pte_lock(h, mm, ptep);
2480                 if (huge_pmd_unshare(mm, &address, ptep))
2481                         goto unlock;
2482
2483                 pte = huge_ptep_get(ptep);
2484                 if (huge_pte_none(pte))
2485                         goto unlock;
2486
2487                 /*
2488                  * HWPoisoned hugepage is already unmapped and dropped reference
2489                  */
2490                 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2491                         huge_pte_clear(mm, address, ptep);
2492                         goto unlock;
2493                 }
2494
2495                 page = pte_page(pte);
2496                 /*
2497                  * If a reference page is supplied, it is because a specific
2498                  * page is being unmapped, not a range. Ensure the page we
2499                  * are about to unmap is the actual page of interest.
2500                  */
2501                 if (ref_page) {
2502                         if (page != ref_page)
2503                                 goto unlock;
2504
2505                         /*
2506                          * Mark the VMA as having unmapped its page so that
2507                          * future faults in this VMA will fail rather than
2508                          * looking like data was lost
2509                          */
2510                         set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
2511                 }
2512
2513                 pte = huge_ptep_get_and_clear(mm, address, ptep);
2514                 tlb_remove_tlb_entry(tlb, ptep, address);
2515                 if (huge_pte_dirty(pte))
2516                         set_page_dirty(page);
2517
2518                 page_remove_rmap(page);
2519                 force_flush = !__tlb_remove_page(tlb, page);
2520                 if (force_flush) {
2521                         spin_unlock(ptl);
2522                         break;
2523                 }
2524                 /* Bail out after unmapping reference page if supplied */
2525                 if (ref_page) {
2526                         spin_unlock(ptl);
2527                         break;
2528                 }
2529 unlock:
2530                 spin_unlock(ptl);
2531         }
2532         /*
2533          * mmu_gather ran out of room to batch pages, we break out of
2534          * the PTE lock to avoid doing the potential expensive TLB invalidate
2535          * and page-free while holding it.
2536          */
2537         if (force_flush) {
2538                 force_flush = 0;
2539                 tlb_flush_mmu(tlb);
2540                 if (address < end && !ref_page)
2541                         goto again;
2542         }
2543         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2544         tlb_end_vma(tlb, vma);
2545 }
2546
2547 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2548                           struct vm_area_struct *vma, unsigned long start,
2549                           unsigned long end, struct page *ref_page)
2550 {
2551         __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2552
2553         /*
2554          * Clear this flag so that x86's huge_pmd_share page_table_shareable
2555          * test will fail on a vma being torn down, and not grab a page table
2556          * on its way out.  We're lucky that the flag has such an appropriate
2557          * name, and can in fact be safely cleared here. We could clear it
2558          * before the __unmap_hugepage_range above, but all that's necessary
2559          * is to clear it before releasing the i_mmap_mutex. This works
2560          * because in the context this is called, the VMA is about to be
2561          * destroyed and the i_mmap_mutex is held.
2562          */
2563         vma->vm_flags &= ~VM_MAYSHARE;
2564 }
2565
2566 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2567                           unsigned long end, struct page *ref_page)
2568 {
2569         struct mm_struct *mm;
2570         struct mmu_gather tlb;
2571
2572         mm = vma->vm_mm;
2573
2574         tlb_gather_mmu(&tlb, mm, start, end);
2575         __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2576         tlb_finish_mmu(&tlb, start, end);
2577 }
2578
2579 /*
2580  * This is called when the original mapper is failing to COW a MAP_PRIVATE
2581  * mappping it owns the reserve page for. The intention is to unmap the page
2582  * from other VMAs and let the children be SIGKILLed if they are faulting the
2583  * same region.
2584  */
2585 static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2586                                 struct page *page, unsigned long address)
2587 {
2588         struct hstate *h = hstate_vma(vma);
2589         struct vm_area_struct *iter_vma;
2590         struct address_space *mapping;
2591         pgoff_t pgoff;
2592
2593         /*
2594          * vm_pgoff is in PAGE_SIZE units, hence the different calculation
2595          * from page cache lookup which is in HPAGE_SIZE units.
2596          */
2597         address = address & huge_page_mask(h);
2598         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2599                         vma->vm_pgoff;
2600         mapping = file_inode(vma->vm_file)->i_mapping;
2601
2602         /*
2603          * Take the mapping lock for the duration of the table walk. As
2604          * this mapping should be shared between all the VMAs,
2605          * __unmap_hugepage_range() is called as the lock is already held
2606          */
2607         mutex_lock(&mapping->i_mmap_mutex);
2608         vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2609                 /* Do not unmap the current VMA */
2610                 if (iter_vma == vma)
2611                         continue;
2612
2613                 /*
2614                  * Unmap the page from other VMAs without their own reserves.
2615                  * They get marked to be SIGKILLed if they fault in these
2616                  * areas. This is because a future no-page fault on this VMA
2617                  * could insert a zeroed page instead of the data existing
2618                  * from the time of fork. This would look like data corruption
2619                  */
2620                 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2621                         unmap_hugepage_range(iter_vma, address,
2622                                              address + huge_page_size(h), page);
2623         }
2624         mutex_unlock(&mapping->i_mmap_mutex);
2625
2626         return 1;
2627 }
2628
2629 /*
2630  * Hugetlb_cow() should be called with page lock of the original hugepage held.
2631  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
2632  * cannot race with other handlers or page migration.
2633  * Keep the pte_same checks anyway to make transition from the mutex easier.
2634  */
2635 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2636                         unsigned long address, pte_t *ptep, pte_t pte,
2637                         struct page *pagecache_page, spinlock_t *ptl)
2638 {
2639         struct hstate *h = hstate_vma(vma);
2640         struct page *old_page, *new_page;
2641         int outside_reserve = 0;
2642         unsigned long mmun_start;       /* For mmu_notifiers */
2643         unsigned long mmun_end;         /* For mmu_notifiers */
2644
2645         old_page = pte_page(pte);
2646
2647 retry_avoidcopy:
2648         /* If no-one else is actually using this page, avoid the copy
2649          * and just make the page writable */
2650         if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
2651                 page_move_anon_rmap(old_page, vma, address);
2652                 set_huge_ptep_writable(vma, address, ptep);
2653                 return 0;
2654         }
2655
2656         /*
2657          * If the process that created a MAP_PRIVATE mapping is about to
2658          * perform a COW due to a shared page count, attempt to satisfy
2659          * the allocation without using the existing reserves. The pagecache
2660          * page is used to determine if the reserve at this address was
2661          * consumed or not. If reserves were used, a partial faulted mapping
2662          * at the time of fork() could consume its reserves on COW instead
2663          * of the full address range.
2664          */
2665         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2666                         old_page != pagecache_page)
2667                 outside_reserve = 1;
2668
2669         page_cache_get(old_page);
2670
2671         /* Drop page table lock as buddy allocator may be called */
2672         spin_unlock(ptl);
2673         new_page = alloc_huge_page(vma, address, outside_reserve);
2674
2675         if (IS_ERR(new_page)) {
2676                 long err = PTR_ERR(new_page);
2677                 page_cache_release(old_page);
2678
2679                 /*
2680                  * If a process owning a MAP_PRIVATE mapping fails to COW,
2681                  * it is due to references held by a child and an insufficient
2682                  * huge page pool. To guarantee the original mappers
2683                  * reliability, unmap the page from child processes. The child
2684                  * may get SIGKILLed if it later faults.
2685                  */
2686                 if (outside_reserve) {
2687                         BUG_ON(huge_pte_none(pte));
2688                         if (unmap_ref_private(mm, vma, old_page, address)) {
2689                                 BUG_ON(huge_pte_none(pte));
2690                                 spin_lock(ptl);
2691                                 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2692                                 if (likely(pte_same(huge_ptep_get(ptep), pte)))
2693                                         goto retry_avoidcopy;
2694                                 /*
2695                                  * race occurs while re-acquiring page table
2696                                  * lock, and our job is done.
2697                                  */
2698                                 return 0;
2699                         }
2700                         WARN_ON_ONCE(1);
2701                 }
2702
2703                 /* Caller expects lock to be held */
2704                 spin_lock(ptl);
2705                 if (err == -ENOMEM)
2706                         return VM_FAULT_OOM;
2707                 else
2708                         return VM_FAULT_SIGBUS;
2709         }
2710
2711         /*
2712          * When the original hugepage is shared one, it does not have
2713          * anon_vma prepared.
2714          */
2715         if (unlikely(anon_vma_prepare(vma))) {
2716                 page_cache_release(new_page);
2717                 page_cache_release(old_page);
2718                 /* Caller expects lock to be held */
2719                 spin_lock(ptl);
2720                 return VM_FAULT_OOM;
2721         }
2722
2723         copy_user_huge_page(new_page, old_page, address, vma,
2724                             pages_per_huge_page(h));
2725         __SetPageUptodate(new_page);
2726
2727         mmun_start = address & huge_page_mask(h);
2728         mmun_end = mmun_start + huge_page_size(h);
2729         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2730         /*
2731          * Retake the page table lock to check for racing updates
2732          * before the page tables are altered
2733          */
2734         spin_lock(ptl);
2735         ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2736         if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2737                 ClearPagePrivate(new_page);
2738
2739                 /* Break COW */
2740                 huge_ptep_clear_flush(vma, address, ptep);
2741                 set_huge_pte_at(mm, address, ptep,
2742                                 make_huge_pte(vma, new_page, 1));
2743                 page_remove_rmap(old_page);
2744                 hugepage_add_new_anon_rmap(new_page, vma, address);
2745                 /* Make the old page be freed below */
2746                 new_page = old_page;
2747         }
2748         spin_unlock(ptl);
2749         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2750         page_cache_release(new_page);
2751         page_cache_release(old_page);
2752
2753         /* Caller expects lock to be held */
2754         spin_lock(ptl);
2755         return 0;
2756 }
2757
2758 /* Return the pagecache page at a given address within a VMA */
2759 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
2760                         struct vm_area_struct *vma, unsigned long address)
2761 {
2762         struct address_space *mapping;
2763         pgoff_t idx;
2764
2765         mapping = vma->vm_file->f_mapping;
2766         idx = vma_hugecache_offset(h, vma, address);
2767
2768         return find_lock_page(mapping, idx);
2769 }
2770
2771 /*
2772  * Return whether there is a pagecache page to back given address within VMA.
2773  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2774  */
2775 static bool hugetlbfs_pagecache_present(struct hstate *h,
2776                         struct vm_area_struct *vma, unsigned long address)
2777 {
2778         struct address_space *mapping;
2779         pgoff_t idx;
2780         struct page *page;
2781
2782         mapping = vma->vm_file->f_mapping;
2783         idx = vma_hugecache_offset(h, vma, address);
2784
2785         page = find_get_page(mapping, idx);
2786         if (page)
2787                 put_page(page);
2788         return page != NULL;
2789 }
2790
2791 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2792                            struct address_space *mapping, pgoff_t idx,
2793                            unsigned long address, pte_t *ptep, unsigned int flags)
2794 {
2795         struct hstate *h = hstate_vma(vma);
2796         int ret = VM_FAULT_SIGBUS;
2797         int anon_rmap = 0;
2798         unsigned long size;
2799         struct page *page;
2800         pte_t new_pte;
2801         spinlock_t *ptl;
2802
2803         /*
2804          * Currently, we are forced to kill the process in the event the
2805          * original mapper has unmapped pages from the child due to a failed
2806          * COW. Warn that such a situation has occurred as it may not be obvious
2807          */
2808         if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2809                 pr_warning("PID %d killed due to inadequate hugepage pool\n",
2810                            current->pid);
2811                 return ret;
2812         }
2813
2814         /*
2815          * Use page lock to guard against racing truncation
2816          * before we get page_table_lock.
2817          */
2818 retry:
2819         page = find_lock_page(mapping, idx);
2820         if (!page) {
2821                 size = i_size_read(mapping->host) >> huge_page_shift(h);
2822                 if (idx >= size)
2823                         goto out;
2824                 page = alloc_huge_page(vma, address, 0);
2825                 if (IS_ERR(page)) {
2826                         ret = PTR_ERR(page);
2827                         if (ret == -ENOMEM)
2828                                 ret = VM_FAULT_OOM;
2829                         else
2830                                 ret = VM_FAULT_SIGBUS;
2831                         goto out;
2832                 }
2833                 clear_huge_page(page, address, pages_per_huge_page(h));
2834                 __SetPageUptodate(page);
2835
2836                 if (vma->vm_flags & VM_MAYSHARE) {
2837                         int err;
2838                         struct inode *inode = mapping->host;
2839
2840                         err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
2841                         if (err) {
2842                                 put_page(page);
2843                                 if (err == -EEXIST)
2844                                         goto retry;
2845                                 goto out;
2846                         }
2847                         ClearPagePrivate(page);
2848
2849                         spin_lock(&inode->i_lock);
2850                         inode->i_blocks += blocks_per_huge_page(h);
2851                         spin_unlock(&inode->i_lock);
2852                 } else {
2853                         lock_page(page);
2854                         if (unlikely(anon_vma_prepare(vma))) {
2855                                 ret = VM_FAULT_OOM;
2856                                 goto backout_unlocked;
2857                         }
2858                         anon_rmap = 1;
2859                 }
2860         } else {
2861                 /*
2862                  * If memory error occurs between mmap() and fault, some process
2863                  * don't have hwpoisoned swap entry for errored virtual address.
2864                  * So we need to block hugepage fault by PG_hwpoison bit check.
2865                  */
2866                 if (unlikely(PageHWPoison(page))) {
2867                         ret = VM_FAULT_HWPOISON |
2868                                 VM_FAULT_SET_HINDEX(hstate_index(h));
2869                         goto backout_unlocked;
2870                 }
2871         }
2872
2873         /*
2874          * If we are going to COW a private mapping later, we examine the
2875          * pending reservations for this page now. This will ensure that
2876          * any allocations necessary to record that reservation occur outside
2877          * the spinlock.
2878          */
2879         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
2880                 if (vma_needs_reservation(h, vma, address) < 0) {
2881                         ret = VM_FAULT_OOM;
2882                         goto backout_unlocked;
2883                 }
2884
2885         ptl = huge_pte_lockptr(h, mm, ptep);
2886         spin_lock(ptl);
2887         size = i_size_read(mapping->host) >> huge_page_shift(h);
2888         if (idx >= size)
2889                 goto backout;
2890
2891         ret = 0;
2892         if (!huge_pte_none(huge_ptep_get(ptep)))
2893                 goto backout;
2894
2895         if (anon_rmap) {
2896                 ClearPagePrivate(page);
2897                 hugepage_add_new_anon_rmap(page, vma, address);
2898         }
2899         else
2900                 page_dup_rmap(page);
2901         new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
2902                                 && (vma->vm_flags & VM_SHARED)));
2903         set_huge_pte_at(mm, address, ptep, new_pte);
2904
2905         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
2906                 /* Optimization, do the COW without a second fault */
2907                 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
2908         }
2909
2910         spin_unlock(ptl);
2911         unlock_page(page);
2912 out:
2913         return ret;
2914
2915 backout:
2916         spin_unlock(ptl);
2917 backout_unlocked:
2918         unlock_page(page);
2919         put_page(page);
2920         goto out;
2921 }
2922
2923 #ifdef CONFIG_SMP
2924 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
2925                             struct vm_area_struct *vma,
2926                             struct address_space *mapping,
2927                             pgoff_t idx, unsigned long address)
2928 {
2929         unsigned long key[2];
2930         u32 hash;
2931
2932         if (vma->vm_flags & VM_SHARED) {
2933                 key[0] = (unsigned long) mapping;
2934                 key[1] = idx;
2935         } else {
2936                 key[0] = (unsigned long) mm;
2937                 key[1] = address >> huge_page_shift(h);
2938         }
2939
2940         hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
2941
2942         return hash & (num_fault_mutexes - 1);
2943 }
2944 #else
2945 /*
2946  * For uniprocesor systems we always use a single mutex, so just
2947  * return 0 and avoid the hashing overhead.
2948  */
2949 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
2950                             struct vm_area_struct *vma,
2951                             struct address_space *mapping,
2952                             pgoff_t idx, unsigned long address)
2953 {
2954         return 0;
2955 }
2956 #endif
2957
2958 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2959                         unsigned long address, unsigned int flags)
2960 {
2961         pte_t *ptep, entry;
2962         spinlock_t *ptl;
2963         int ret;
2964         u32 hash;
2965         pgoff_t idx;
2966         struct page *page = NULL;
2967         struct page *pagecache_page = NULL;
2968         struct hstate *h = hstate_vma(vma);
2969         struct address_space *mapping;
2970
2971         address &= huge_page_mask(h);
2972
2973         ptep = huge_pte_offset(mm, address);
2974         if (ptep) {
2975                 entry = huge_ptep_get(ptep);
2976                 if (unlikely(is_hugetlb_entry_migration(entry))) {
2977                         migration_entry_wait_huge(vma, mm, ptep);
2978                         return 0;
2979                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2980                         return VM_FAULT_HWPOISON_LARGE |
2981                                 VM_FAULT_SET_HINDEX(hstate_index(h));
2982         }
2983
2984         ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2985         if (!ptep)
2986                 return VM_FAULT_OOM;
2987
2988         mapping = vma->vm_file->f_mapping;
2989         idx = vma_hugecache_offset(h, vma, address);
2990
2991         /*
2992          * Serialize hugepage allocation and instantiation, so that we don't
2993          * get spurious allocation failures if two CPUs race to instantiate
2994          * the same page in the page cache.
2995          */
2996         hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
2997         mutex_lock(&htlb_fault_mutex_table[hash]);
2998
2999         entry = huge_ptep_get(ptep);
3000         if (huge_pte_none(entry)) {
3001                 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
3002                 goto out_mutex;
3003         }
3004
3005         ret = 0;
3006
3007         /*
3008          * If we are going to COW the mapping later, we examine the pending
3009          * reservations for this page now. This will ensure that any
3010          * allocations necessary to record that reservation occur outside the
3011          * spinlock. For private mappings, we also lookup the pagecache
3012          * page now as it is used to determine if a reservation has been
3013          * consumed.
3014          */
3015         if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
3016                 if (vma_needs_reservation(h, vma, address) < 0) {
3017                         ret = VM_FAULT_OOM;
3018                         goto out_mutex;
3019                 }
3020
3021                 if (!(vma->vm_flags & VM_MAYSHARE))
3022                         pagecache_page = hugetlbfs_pagecache_page(h,
3023                                                                 vma, address);
3024         }
3025
3026         /*
3027          * hugetlb_cow() requires page locks of pte_page(entry) and
3028          * pagecache_page, so here we need take the former one
3029          * when page != pagecache_page or !pagecache_page.
3030          * Note that locking order is always pagecache_page -> page,
3031          * so no worry about deadlock.
3032          */
3033         page = pte_page(entry);
3034         get_page(page);
3035         if (page != pagecache_page)
3036                 lock_page(page);
3037
3038         ptl = huge_pte_lockptr(h, mm, ptep);
3039         spin_lock(ptl);
3040         /* Check for a racing update before calling hugetlb_cow */
3041         if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3042                 goto out_ptl;
3043
3044
3045         if (flags & FAULT_FLAG_WRITE) {
3046                 if (!huge_pte_write(entry)) {
3047                         ret = hugetlb_cow(mm, vma, address, ptep, entry,
3048                                         pagecache_page, ptl);
3049                         goto out_ptl;
3050                 }
3051                 entry = huge_pte_mkdirty(entry);
3052         }
3053         entry = pte_mkyoung(entry);
3054         if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3055                                                 flags & FAULT_FLAG_WRITE))
3056                 update_mmu_cache(vma, address, ptep);
3057
3058 out_ptl:
3059         spin_unlock(ptl);
3060
3061         if (pagecache_page) {
3062                 unlock_page(pagecache_page);
3063                 put_page(pagecache_page);
3064         }
3065         if (page != pagecache_page)
3066                 unlock_page(page);
3067         put_page(page);
3068
3069 out_mutex:
3070         mutex_unlock(&htlb_fault_mutex_table[hash]);
3071         return ret;
3072 }
3073
3074 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3075                          struct page **pages, struct vm_area_struct **vmas,
3076                          unsigned long *position, unsigned long *nr_pages,
3077                          long i, unsigned int flags)
3078 {
3079         unsigned long pfn_offset;
3080         unsigned long vaddr = *position;
3081         unsigned long remainder = *nr_pages;
3082         struct hstate *h = hstate_vma(vma);
3083
3084         while (vaddr < vma->vm_end && remainder) {
3085                 pte_t *pte;
3086                 spinlock_t *ptl = NULL;
3087                 int absent;
3088                 struct page *page;
3089
3090                 /*
3091                  * Some archs (sparc64, sh*) have multiple pte_ts to
3092                  * each hugepage.  We have to make sure we get the
3093                  * first, for the page indexing below to work.
3094                  *
3095                  * Note that page table lock is not held when pte is null.
3096                  */
3097                 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
3098                 if (pte)
3099                         ptl = huge_pte_lock(h, mm, pte);
3100                 absent = !pte || huge_pte_none(huge_ptep_get(pte));
3101
3102                 /*
3103                  * When coredumping, it suits get_dump_page if we just return
3104                  * an error where there's an empty slot with no huge pagecache
3105                  * to back it.  This way, we avoid allocating a hugepage, and
3106                  * the sparse dumpfile avoids allocating disk blocks, but its
3107                  * huge holes still show up with zeroes where they need to be.
3108                  */
3109                 if (absent && (flags & FOLL_DUMP) &&
3110                     !hugetlbfs_pagecache_present(h, vma, vaddr)) {
3111                         if (pte)
3112                                 spin_unlock(ptl);
3113                         remainder = 0;
3114                         break;
3115                 }
3116
3117                 /*
3118                  * We need call hugetlb_fault for both hugepages under migration
3119                  * (in which case hugetlb_fault waits for the migration,) and
3120                  * hwpoisoned hugepages (in which case we need to prevent the
3121                  * caller from accessing to them.) In order to do this, we use
3122                  * here is_swap_pte instead of is_hugetlb_entry_migration and
3123                  * is_hugetlb_entry_hwpoisoned. This is because it simply covers
3124                  * both cases, and because we can't follow correct pages
3125                  * directly from any kind of swap entries.
3126                  */
3127                 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
3128                     ((flags & FOLL_WRITE) &&
3129                       !huge_pte_write(huge_ptep_get(pte)))) {
3130                         int ret;
3131
3132                         if (pte)
3133                                 spin_unlock(ptl);
3134                         ret = hugetlb_fault(mm, vma, vaddr,
3135                                 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
3136                         if (!(ret & VM_FAULT_ERROR))
3137                                 continue;
3138
3139                         remainder = 0;
3140                         break;
3141                 }
3142
3143                 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
3144                 page = pte_page(huge_ptep_get(pte));
3145 same_page:
3146                 if (pages) {
3147                         pages[i] = mem_map_offset(page, pfn_offset);
3148                         get_page_foll(pages[i]);
3149                 }
3150
3151                 if (vmas)
3152                         vmas[i] = vma;
3153
3154                 vaddr += PAGE_SIZE;
3155                 ++pfn_offset;
3156                 --remainder;
3157                 ++i;
3158                 if (vaddr < vma->vm_end && remainder &&
3159                                 pfn_offset < pages_per_huge_page(h)) {
3160                         /*
3161                          * We use pfn_offset to avoid touching the pageframes
3162                          * of this compound page.
3163                          */
3164                         goto same_page;
3165                 }
3166                 spin_unlock(ptl);
3167         }
3168         *nr_pages = remainder;
3169         *position = vaddr;
3170
3171         return i ? i : -EFAULT;
3172 }
3173
3174 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3175                 unsigned long address, unsigned long end, pgprot_t newprot)
3176 {
3177         struct mm_struct *mm = vma->vm_mm;
3178         unsigned long start = address;
3179         pte_t *ptep;
3180         pte_t pte;
3181         struct hstate *h = hstate_vma(vma);
3182         unsigned long pages = 0;
3183
3184         BUG_ON(address >= end);
3185         flush_cache_range(vma, address, end);
3186
3187         mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
3188         for (; address < end; address += huge_page_size(h)) {
3189                 spinlock_t *ptl;
3190                 ptep = huge_pte_offset(mm, address);
3191                 if (!ptep)
3192                         continue;
3193                 ptl = huge_pte_lock(h, mm, ptep);
3194                 if (huge_pmd_unshare(mm, &address, ptep)) {
3195                         pages++;
3196                         spin_unlock(ptl);
3197                         continue;
3198                 }
3199                 if (!huge_pte_none(huge_ptep_get(ptep))) {
3200                         pte = huge_ptep_get_and_clear(mm, address, ptep);
3201                         pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3202                         pte = arch_make_huge_pte(pte, vma, NULL, 0);
3203                         set_huge_pte_at(mm, address, ptep, pte);
3204                         pages++;
3205                 }
3206                 spin_unlock(ptl);
3207         }
3208         /*
3209          * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3210          * may have cleared our pud entry and done put_page on the page table:
3211          * once we release i_mmap_mutex, another task can do the final put_page
3212          * and that page table be reused and filled with junk.
3213          */
3214         flush_tlb_range(vma, start, end);
3215         mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3216
3217         return pages << h->order;
3218 }
3219
3220 int hugetlb_reserve_pages(struct inode *inode,
3221                                         long from, long to,
3222                                         struct vm_area_struct *vma,
3223                                         vm_flags_t vm_flags)
3224 {
3225         long ret, chg;
3226         struct hstate *h = hstate_inode(inode);
3227         struct hugepage_subpool *spool = subpool_inode(inode);
3228         struct resv_map *resv_map;
3229
3230         /*
3231          * Only apply hugepage reservation if asked. At fault time, an
3232          * attempt will be made for VM_NORESERVE to allocate a page
3233          * without using reserves
3234          */
3235         if (vm_flags & VM_NORESERVE)
3236                 return 0;
3237
3238         /*
3239          * Shared mappings base their reservation on the number of pages that
3240          * are already allocated on behalf of the file. Private mappings need
3241          * to reserve the full area even if read-only as mprotect() may be
3242          * called to make the mapping read-write. Assume !vma is a shm mapping
3243          */
3244         if (!vma || vma->vm_flags & VM_MAYSHARE) {
3245                 resv_map = inode_resv_map(inode);
3246
3247                 chg = region_chg(resv_map, from, to);
3248
3249         } else {
3250                 resv_map = resv_map_alloc();
3251                 if (!resv_map)
3252                         return -ENOMEM;
3253
3254                 chg = to - from;
3255
3256                 set_vma_resv_map(vma, resv_map);
3257                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
3258         }
3259
3260         if (chg < 0) {
3261                 ret = chg;
3262                 goto out_err;
3263         }
3264
3265         /* There must be enough pages in the subpool for the mapping */
3266         if (hugepage_subpool_get_pages(spool, chg)) {
3267                 ret = -ENOSPC;
3268                 goto out_err;
3269         }
3270
3271         /*
3272          * Check enough hugepages are available for the reservation.
3273          * Hand the pages back to the subpool if there are not
3274          */
3275         ret = hugetlb_acct_memory(h, chg);
3276         if (ret < 0) {
3277                 hugepage_subpool_put_pages(spool, chg);
3278                 goto out_err;
3279         }
3280
3281         /*
3282          * Account for the reservations made. Shared mappings record regions
3283          * that have reservations as they are shared by multiple VMAs.
3284          * When the last VMA disappears, the region map says how much
3285          * the reservation was and the page cache tells how much of
3286          * the reservation was consumed. Private mappings are per-VMA and
3287          * only the consumed reservations are tracked. When the VMA
3288          * disappears, the original reservation is the VMA size and the
3289          * consumed reservations are stored in the map. Hence, nothing
3290          * else has to be done for private mappings here
3291          */
3292         if (!vma || vma->vm_flags & VM_MAYSHARE)
3293                 region_add(resv_map, from, to);
3294         return 0;
3295 out_err:
3296         if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3297                 kref_put(&resv_map->refs, resv_map_release);
3298         return ret;
3299 }
3300
3301 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3302 {
3303         struct hstate *h = hstate_inode(inode);
3304         struct resv_map *resv_map = inode_resv_map(inode);
3305         long chg = 0;
3306         struct hugepage_subpool *spool = subpool_inode(inode);
3307
3308         if (resv_map)
3309                 chg = region_truncate(resv_map, offset);
3310         spin_lock(&inode->i_lock);
3311         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3312         spin_unlock(&inode->i_lock);
3313
3314         hugepage_subpool_put_pages(spool, (chg - freed));
3315         hugetlb_acct_memory(h, -(chg - freed));
3316 }
3317
3318 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
3319 static unsigned long page_table_shareable(struct vm_area_struct *svma,
3320                                 struct vm_area_struct *vma,
3321                                 unsigned long addr, pgoff_t idx)
3322 {
3323         unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
3324                                 svma->vm_start;
3325         unsigned long sbase = saddr & PUD_MASK;
3326         unsigned long s_end = sbase + PUD_SIZE;
3327
3328         /* Allow segments to share if only one is marked locked */
3329         unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
3330         unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
3331
3332         /*
3333          * match the virtual addresses, permission and the alignment of the
3334          * page table page.
3335          */
3336         if (pmd_index(addr) != pmd_index(saddr) ||
3337             vm_flags != svm_flags ||
3338             sbase < svma->vm_start || svma->vm_end < s_end)
3339                 return 0;
3340
3341         return saddr;
3342 }
3343
3344 static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
3345 {
3346         unsigned long base = addr & PUD_MASK;
3347         unsigned long end = base + PUD_SIZE;
3348
3349         /*
3350          * check on proper vm_flags and page table alignment
3351          */
3352         if (vma->vm_flags & VM_MAYSHARE &&
3353             vma->vm_start <= base && end <= vma->vm_end)
3354                 return 1;
3355         return 0;
3356 }
3357
3358 /*
3359  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
3360  * and returns the corresponding pte. While this is not necessary for the
3361  * !shared pmd case because we can allocate the pmd later as well, it makes the
3362  * code much cleaner. pmd allocation is essential for the shared case because
3363  * pud has to be populated inside the same i_mmap_mutex section - otherwise
3364  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
3365  * bad pmd for sharing.
3366  */
3367 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3368 {
3369         struct vm_area_struct *vma = find_vma(mm, addr);
3370         struct address_space *mapping = vma->vm_file->f_mapping;
3371         pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
3372                         vma->vm_pgoff;
3373         struct vm_area_struct *svma;
3374         unsigned long saddr;
3375         pte_t *spte = NULL;
3376         pte_t *pte;
3377         spinlock_t *ptl;
3378
3379         if (!vma_shareable(vma, addr))
3380                 return (pte_t *)pmd_alloc(mm, pud, addr);
3381
3382         mutex_lock(&mapping->i_mmap_mutex);
3383         vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
3384                 if (svma == vma)
3385                         continue;
3386
3387                 saddr = page_table_shareable(svma, vma, addr, idx);
3388                 if (saddr) {
3389                         spte = huge_pte_offset(svma->vm_mm, saddr);
3390                         if (spte) {
3391                                 get_page(virt_to_page(spte));
3392                                 break;
3393                         }
3394                 }
3395         }
3396
3397         if (!spte)
3398                 goto out;
3399
3400         ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3401         spin_lock(ptl);
3402         if (pud_none(*pud))
3403                 pud_populate(mm, pud,
3404                                 (pmd_t *)((unsigned long)spte & PAGE_MASK));
3405         else
3406                 put_page(virt_to_page(spte));
3407         spin_unlock(ptl);
3408 out:
3409         pte = (pte_t *)pmd_alloc(mm, pud, addr);
3410         mutex_unlock(&mapping->i_mmap_mutex);
3411         return pte;
3412 }
3413
3414 /*
3415  * unmap huge page backed by shared pte.
3416  *
3417  * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
3418  * indicated by page_count > 1, unmap is achieved by clearing pud and
3419  * decrementing the ref count. If count == 1, the pte page is not shared.
3420  *
3421  * called with page table lock held.
3422  *
3423  * returns: 1 successfully unmapped a shared pte page
3424  *          0 the underlying pte page is not shared, or it is the last user
3425  */
3426 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3427 {
3428         pgd_t *pgd = pgd_offset(mm, *addr);
3429         pud_t *pud = pud_offset(pgd, *addr);
3430
3431         BUG_ON(page_count(virt_to_page(ptep)) == 0);
3432         if (page_count(virt_to_page(ptep)) == 1)
3433                 return 0;
3434
3435         pud_clear(pud);
3436         put_page(virt_to_page(ptep));
3437         *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
3438         return 1;
3439 }
3440 #define want_pmd_share()        (1)
3441 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
3442 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3443 {
3444         return NULL;
3445 }
3446 #define want_pmd_share()        (0)
3447 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
3448
3449 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
3450 pte_t *huge_pte_alloc(struct mm_struct *mm,
3451                         unsigned long addr, unsigned long sz)
3452 {
3453         pgd_t *pgd;
3454         pud_t *pud;
3455         pte_t *pte = NULL;
3456
3457         pgd = pgd_offset(mm, addr);
3458         pud = pud_alloc(mm, pgd, addr);
3459         if (pud) {
3460                 if (sz == PUD_SIZE) {
3461                         pte = (pte_t *)pud;
3462                 } else {
3463                         BUG_ON(sz != PMD_SIZE);
3464                         if (want_pmd_share() && pud_none(*pud))
3465                                 pte = huge_pmd_share(mm, addr, pud);
3466                         else
3467                                 pte = (pte_t *)pmd_alloc(mm, pud, addr);
3468                 }
3469         }
3470         BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
3471
3472         return pte;
3473 }
3474
3475 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
3476 {
3477         pgd_t *pgd;
3478         pud_t *pud;
3479         pmd_t *pmd = NULL;
3480
3481         pgd = pgd_offset(mm, addr);
3482         if (pgd_present(*pgd)) {
3483                 pud = pud_offset(pgd, addr);
3484                 if (pud_present(*pud)) {
3485                         if (pud_huge(*pud))
3486                                 return (pte_t *)pud;
3487                         pmd = pmd_offset(pud, addr);
3488                 }
3489         }
3490         return (pte_t *) pmd;
3491 }
3492
3493 struct page *
3494 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3495                 pmd_t *pmd, int write)
3496 {
3497         struct page *page;
3498
3499         page = pte_page(*(pte_t *)pmd);
3500         if (page)
3501                 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
3502         return page;
3503 }
3504
3505 struct page *
3506 follow_huge_pud(struct mm_struct *mm, unsigned long address,
3507                 pud_t *pud, int write)
3508 {
3509         struct page *page;
3510
3511         page = pte_page(*(pte_t *)pud);
3512         if (page)
3513                 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
3514         return page;
3515 }
3516
3517 #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3518
3519 /* Can be overriden by architectures */
3520 __attribute__((weak)) struct page *
3521 follow_huge_pud(struct mm_struct *mm, unsigned long address,
3522                pud_t *pud, int write)
3523 {
3524         BUG();
3525         return NULL;
3526 }
3527
3528 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3529
3530 #ifdef CONFIG_MEMORY_FAILURE
3531
3532 /* Should be called in hugetlb_lock */
3533 static int is_hugepage_on_freelist(struct page *hpage)
3534 {
3535         struct page *page;
3536         struct page *tmp;
3537         struct hstate *h = page_hstate(hpage);
3538         int nid = page_to_nid(hpage);
3539
3540         list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3541                 if (page == hpage)
3542                         return 1;
3543         return 0;
3544 }
3545
3546 /*
3547  * This function is called from memory failure code.
3548  * Assume the caller holds page lock of the head page.
3549  */
3550 int dequeue_hwpoisoned_huge_page(struct page *hpage)
3551 {
3552         struct hstate *h = page_hstate(hpage);
3553         int nid = page_to_nid(hpage);
3554         int ret = -EBUSY;
3555
3556         spin_lock(&hugetlb_lock);
3557         if (is_hugepage_on_freelist(hpage)) {
3558                 /*
3559                  * Hwpoisoned hugepage isn't linked to activelist or freelist,
3560                  * but dangling hpage->lru can trigger list-debug warnings
3561                  * (this happens when we call unpoison_memory() on it),
3562                  * so let it point to itself with list_del_init().
3563                  */
3564                 list_del_init(&hpage->lru);
3565                 set_page_refcounted(hpage);
3566                 h->free_huge_pages--;
3567                 h->free_huge_pages_node[nid]--;
3568                 ret = 0;
3569         }
3570         spin_unlock(&hugetlb_lock);
3571         return ret;
3572 }
3573 #endif
3574
3575 bool isolate_huge_page(struct page *page, struct list_head *list)
3576 {
3577         VM_BUG_ON_PAGE(!PageHead(page), page);
3578         if (!get_page_unless_zero(page))
3579                 return false;
3580         spin_lock(&hugetlb_lock);
3581         list_move_tail(&page->lru, list);
3582         spin_unlock(&hugetlb_lock);
3583         return true;
3584 }
3585
3586 void putback_active_hugepage(struct page *page)
3587 {
3588         VM_BUG_ON_PAGE(!PageHead(page), page);
3589         spin_lock(&hugetlb_lock);
3590         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3591         spin_unlock(&hugetlb_lock);
3592         put_page(page);
3593 }
3594
3595 bool is_hugepage_active(struct page *page)
3596 {
3597         VM_BUG_ON_PAGE(!PageHuge(page), page);
3598         /*
3599          * This function can be called for a tail page because the caller,
3600          * scan_movable_pages, scans through a given pfn-range which typically
3601          * covers one memory block. In systems using gigantic hugepage (1GB
3602          * for x86_64,) a hugepage is larger than a memory block, and we don't
3603          * support migrating such large hugepages for now, so return false
3604          * when called for tail pages.
3605          */
3606         if (PageTail(page))
3607                 return false;
3608         /*
3609          * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3610          * so we should return false for them.
3611          */
3612         if (unlikely(PageHWPoison(page)))
3613                 return false;
3614         return page_count(page) > 0;
3615 }