mm/memory-hotplug: switch locking to a percpu rwsem

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1a88006ec6343ac8117f3c6eead05453a36205e2..1e516520433dc72437819267ec1be5a1770b1dd6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -20,9 +20,9 @@
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
+#include <linux/string_helpers.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
-#include <linux/page-isolation.h>
 #include <linux/jhash.h>
 
 #include <asm/page.h>
@@ -872,7 +872,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
        struct page *page;
 
        list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
-               if (!is_migrate_isolate_page(page))
+               if (!PageHWPoison(page))
                        break;
        /*
         * if 'non-isolated free hugepage' not found on the list,
@@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
        return page;
 }
 
-static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
+               nodemask_t *nmask)
 {
-       struct page *page;
-       int node;
+       unsigned int cpuset_mems_cookie;
+       struct zonelist *zonelist;
+       struct zone *zone;
+       struct zoneref *z;
+       int node = -1;
 
-       if (nid != NUMA_NO_NODE)
-               return dequeue_huge_page_node_exact(h, nid);
+       zonelist = node_zonelist(nid, gfp_mask);
+
+retry_cpuset:
+       cpuset_mems_cookie = read_mems_allowed_begin();
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
+               struct page *page;
+
+               if (!cpuset_zone_allowed(zone, gfp_mask))
+                       continue;
+               /*
+                * no need to ask again on the same node. Pool is node rather than
+                * zone aware
+                */
+               if (zone_to_nid(zone) == node)
+                       continue;
+               node = zone_to_nid(zone);
 
-       for_each_online_node(node) {
                page = dequeue_huge_page_node_exact(h, node);
                if (page)
                        return page;
        }
+       if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
+               goto retry_cpuset;
+
        return NULL;
 }
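
Reviewer note: the control flow of the new dequeue_huge_page_nodemask() can be hard to follow in diff form, so here is a minimal standalone C sketch of the same pattern: walk zones in preferred-node order, try each node's free pool at most once (the pools are per node, not per zone), and restart the walk if the allowed-node snapshot changed underneath it. Everything in the sketch (free_pages, node_allowed, allowed_seq, zone_to_node, dequeue_from_node) is invented for illustration and merely stands in for the kernel's zonelist, cpuset and mems_allowed seqcount machinery.

/*
 * Userspace model of the dequeue_huge_page_nodemask() walk.  Nothing
 * here is kernel API; the names are inventions of this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4
#define NR_ZONES 8

static int free_pages[NR_NODES]    = { 0, 0, 2, 1 };  /* per-node pool   */
static bool node_allowed[NR_NODES] = { true, true, true, true };
static unsigned int allowed_seq;                      /* bumped on change */

/* two zones per node, listed in "preferred node first" order */
static const int zone_to_node[NR_ZONES] = { 1, 1, 0, 0, 2, 2, 3, 3 };

static int dequeue_from_node(int node)
{
        if (free_pages[node] > 0) {
                free_pages[node]--;
                return node;            /* stands in for the dequeued page */
        }
        return -1;
}

static int dequeue_nodemask(void)
{
        unsigned int seq;
        int last_node, z, page;

retry:
        seq = allowed_seq;              /* read_mems_allowed_begin() */
        last_node = -1;
        for (z = 0; z < NR_ZONES; z++) {
                int node = zone_to_node[z];

                if (!node_allowed[node])
                        continue;
                if (node == last_node)  /* pool is per node, not per zone */
                        continue;
                last_node = node;

                page = dequeue_from_node(node);
                if (page >= 0)
                        return page;
        }
        if (seq != allowed_seq)         /* read_mems_allowed_retry() */
                goto retry;
        return -1;
}

int main(void)
{
        printf("got page from node %d\n", dequeue_nodemask());
        printf("got page from node %d\n", dequeue_nodemask());
        return 0;
}

The "last node tried" check mirrors the zone_to_nid(zone) == node test above: consecutive zones of the same node share one hugepage pool, so asking twice would be wasted work.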
 
@@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                                unsigned long address, int avoid_reserve,
                                long chg)
 {
-       struct page *page = NULL;
+       struct page *page;
        struct mempolicy *mpol;
-       nodemask_t *nodemask;
        gfp_t gfp_mask;
+       nodemask_t *nodemask;
        int nid;
-       struct zonelist *zonelist;
-       struct zone *zone;
-       struct zoneref *z;
-       unsigned int cpuset_mems_cookie;
 
        /*
         * A child process with MAP_PRIVATE mappings created by their parent
@@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
        if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
                goto err;
 
-retry_cpuset:
-       cpuset_mems_cookie = read_mems_allowed_begin();
        gfp_mask = htlb_alloc_mask(h);
        nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-       zonelist = node_zonelist(nid, gfp_mask);
-
-       for_each_zone_zonelist_nodemask(zone, z, zonelist,
-                                               MAX_NR_ZONES - 1, nodemask) {
-               if (cpuset_zone_allowed(zone, gfp_mask)) {
-                       page = dequeue_huge_page_node(h, zone_to_nid(zone));
-                       if (page) {
-                               if (avoid_reserve)
-                                       break;
-                               if (!vma_has_reserves(vma, chg))
-                                       break;
-
-                               SetPagePrivate(page);
-                               h->resv_huge_pages--;
-                               break;
-                       }
-               }
+       page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+       if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
+               SetPagePrivate(page);
+               h->resv_huge_pages--;
        }
 
        mpol_cond_put(mpol);
-       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
-               goto retry_cpuset;
        return page;
 
 err:
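
Reviewer note: with the zonelist loop gone, dequeue_huge_page_vma() reduces to "resolve the VMA's policy, dequeue, then consume a reservation if this VMA holds one". The toy model below illustrates only the reservation bookkeeping around that last step; struct pool, struct fake_page, from_reserve and dequeue() are invented names, with from_reserve standing in for SetPagePrivate() and resv for h->resv_huge_pages.

/*
 * Toy model of the reservation bookkeeping in dequeue_huge_page_vma().
 * All structures and names are invented for the sketch.
 */
#include <stdbool.h>
#include <stdio.h>

struct pool {
        long free;      /* pages sitting on the free lists          */
        long resv;      /* pages promised to existing mappings      */
};

struct fake_page {
        bool from_reserve;      /* stands in for PagePrivate        */
};

static struct fake_page pages[8];
static int next_page;

static struct fake_page *dequeue(struct pool *p, bool vma_has_resv,
                                 bool avoid_reserve)
{
        struct fake_page *page;

        /* without a reservation we may only use the unreserved part */
        if (!vma_has_resv && p->free - p->resv == 0)
                return NULL;
        if (avoid_reserve && p->free - p->resv == 0)
                return NULL;
        if (p->free == 0)
                return NULL;

        page = &pages[next_page++];
        p->free--;

        /* consume the reservation this VMA was holding */
        if (!avoid_reserve && vma_has_resv) {
                page->from_reserve = true;
                p->resv--;
        }
        return page;
}

int main(void)
{
        struct pool p = { .free = 1, .resv = 1 };

        /* only a reservation holder may take the last reserved page */
        printf("no reservation:   %p\n", (void *)dequeue(&p, false, false));
        printf("with reservation: %p\n", (void *)dequeue(&p, true, false));
        printf("remaining free=%ld resv=%ld\n", p.free, p.resv);
        return 0;
}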
@@ -1460,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
  * number of free hugepages would be reduced below the number of reserved
  * hugepages.
  */
-static int dissolve_free_huge_page(struct page *page)
+int dissolve_free_huge_page(struct page *page)
 {
        int rc = 0;
 
@@ -1473,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page)
                        rc = -EBUSY;
                        goto out;
                }
+               /*
+                * Move PageHWPoison flag from head page to the raw error page,
+                * which makes any subpages rather than the error page reusable.
+                */
+               if (PageHWPoison(head) && page != head) {
+                       SetPageHWPoison(page);
+                       ClearPageHWPoison(head);
+               }
                list_del(&head->lru);
                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;
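
Reviewer note: the new block in dissolve_free_huge_page() relocates the hardware-poison marker from the compound head to the exact subpage that failed, so that after the huge page is dissolved only that one base page stays quarantined. A compact model of the flag move, with page_flags[] and HWPOISON invented in place of real struct page flag bits:

/*
 * Model of moving the HWPoison marker from the compound head to the
 * raw error subpage before dissolving a huge page: afterwards only the
 * truly bad subpage is unusable.
 */
#include <stdio.h>

#define PAGES_PER_HUGE 8
#define HWPOISON 0x1u

static unsigned int page_flags[PAGES_PER_HUGE];

static void dissolve(unsigned int head, unsigned int bad)
{
        /* the poison bit is initially recorded on the head page */
        if ((page_flags[head] & HWPOISON) && bad != head) {
                page_flags[bad]  |= HWPOISON;   /* SetPageHWPoison(page)   */
                page_flags[head] &= ~HWPOISON;  /* ClearPageHWPoison(head) */
        }
        /* ...the huge page is then unlisted and broken up... */
}

int main(void)
{
        unsigned int i;

        page_flags[0] = HWPOISON;       /* error reported against the head */
        dissolve(0, 5);                 /* but subpage 5 is the bad one    */

        for (i = 0; i < PAGES_PER_HUGE; i++)
                printf("subpage %u: %s\n", i,
                       (page_flags[i] & HWPOISON) ? "poisoned" : "usable");
        return 0;
}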
@@ -1513,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
        return rc;
 }
 
-/*
- * There are 3 ways this can get called:
- * 1. With vma+addr: we use the VMA's memory policy
- * 2. With !vma, but nid=NUMA_NO_NODE:  We try to allocate a huge
- *    page from any node, and let the buddy allocator itself figure
- *    it out.
- * 3. With !vma, but nid!=NUMA_NO_NODE.  We allocate a huge page
- *    strictly from 'nid'
- */
 static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
-               struct vm_area_struct *vma, unsigned long addr, int nid)
+               gfp_t gfp_mask, int nid, nodemask_t *nmask)
 {
        int order = huge_page_order(h);
-       gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
-       unsigned int cpuset_mems_cookie;
-
-       /*
-        * We need a VMA to get a memory policy.  If we do not
-        * have one, we use the 'nid' argument.
-        *
-        * The mempolicy stuff below has some non-inlined bits
-        * and calls ->vm_ops.  That makes it hard to optimize at
-        * compile-time, even when NUMA is off and it does
-        * nothing.  This helps the compiler optimize it out.
-        */
-       if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
-               /*
-                * If a specific node is requested, make sure to
-                * get memory from there, but only when a node
-                * is explicitly specified.
-                */
-               if (nid != NUMA_NO_NODE)
-                       gfp |= __GFP_THISNODE;
-               /*
-                * Make sure to call something that can handle
-                * nid=NUMA_NO_NODE
-                */
-               return alloc_pages_node(nid, gfp, order);
-       }
-
-       /*
-        * OK, so we have a VMA.  Fetch the mempolicy and try to
-        * allocate a huge page with it.  We will only reach this
-        * when CONFIG_NUMA=y.
-        */
-       do {
-               struct page *page;
-               struct mempolicy *mpol;
-               int nid;
-               nodemask_t *nodemask;
-
-               cpuset_mems_cookie = read_mems_allowed_begin();
-               nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
-               mpol_cond_put(mpol);
-               page = __alloc_pages_nodemask(gfp, order, nid, nodemask);
-               if (page)
-                       return page;
-       } while (read_mems_allowed_retry(cpuset_mems_cookie));
 
-       return NULL;
+       gfp_mask |= __GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+       if (nid == NUMA_NO_NODE)
+               nid = numa_mem_id();
+       return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
 }
 
-/*
- * There are two ways to allocate a huge page:
- * 1. When you have a VMA and an address (like a fault)
- * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
- *
- * 'vma' and 'addr' are only for (1).  'nid' is always NUMA_NO_NODE in
- * this case which signifies that the allocation should be done with
- * respect for the VMA's memory policy.
- *
- * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
- * implies that memory policies will not be taken in to account.
- */
-static struct page *__alloc_buddy_huge_page(struct hstate *h,
-               struct vm_area_struct *vma, unsigned long addr, int nid)
+static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nmask)
 {
        struct page *page;
        unsigned int r_nid;
@@ -1596,15 +1540,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
        if (hstate_is_gigantic(h))
                return NULL;
 
-       /*
-        * Make sure that anyone specifying 'nid' is not also specifying a VMA.
-        * This makes sure the caller is picking _one_ of the modes with which
-        * we can call this function, not both.
-        */
-       if (vma || (addr != -1)) {
-               VM_WARN_ON_ONCE(addr == -1);
-               VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
-       }
        /*
         * Assume we will successfully allocate the surplus page to
         * prevent racing processes from causing the surplus to exceed
@@ -1638,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
        }
        spin_unlock(&hugetlb_lock);
 
-       page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
+       page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
 
        spin_lock(&hugetlb_lock);
        if (page) {
@@ -1662,19 +1597,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
        return page;
 }
 
-/*
- * Allocate a huge page from 'nid'.  Note, 'nid' may be
- * NUMA_NO_NODE, which means that it may be allocated
- * anywhere.
- */
-static
-struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
-{
-       unsigned long addr = -1;
-
-       return __alloc_buddy_huge_page(h, NULL, addr, nid);
-}
-
 /*
  * Use the VMA's mpolicy to allocate a huge page from the buddy.
  */
@@ -1682,7 +1604,17 @@ static
 struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
 {
-       return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+       struct page *page;
+       struct mempolicy *mpol;
+       gfp_t gfp_mask = htlb_alloc_mask(h);
+       int nid;
+       nodemask_t *nodemask;
+
+       nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
+       page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+       mpol_cond_put(mpol);
+
+       return page;
 }
 
 /*
@@ -1692,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
  */
 struct page *alloc_huge_page_node(struct hstate *h, int nid)
 {
+       gfp_t gfp_mask = htlb_alloc_mask(h);
        struct page *page = NULL;
 
+       if (nid != NUMA_NO_NODE)
+               gfp_mask |= __GFP_THISNODE;
+
        spin_lock(&hugetlb_lock);
        if (h->free_huge_pages - h->resv_huge_pages > 0)
-               page = dequeue_huge_page_node(h, nid);
+               page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
        spin_unlock(&hugetlb_lock);
 
        if (!page)
-               page = __alloc_buddy_huge_page_no_mpol(h, nid);
+               page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
 
        return page;
 }
 
+
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+               nodemask_t *nmask)
+{
+       gfp_t gfp_mask = htlb_alloc_mask(h);
+
+       spin_lock(&hugetlb_lock);
+       if (h->free_huge_pages - h->resv_huge_pages > 0) {
+               struct page *page;
+
+               page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
+               if (page) {
+                       spin_unlock(&hugetlb_lock);
+                       return page;
+               }
+       }
+       spin_unlock(&hugetlb_lock);
+
+       /* No reservations, try to overcommit */
+
+       return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -1730,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
+               page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+                               NUMA_NO_NODE, NULL);
                if (!page) {
                        alloc_ok = false;
                        break;
                }
                list_add(&page->lru, &surplus_list);
+               cond_resched();
        }
        allocated += i;
 
@@ -2204,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                } else if (!alloc_fresh_huge_page(h,
                                         &node_states[N_MEMORY]))
                        break;
+               cond_resched();
+       }
+       if (i < h->max_huge_pages) {
+               char buf[32];
+
+               string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+               pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
+                       h->max_huge_pages, buf, i);
+               h->max_huge_pages = i;
        }
-       h->max_huge_pages = i;
 }
 
 static void __init hugetlb_init_hstates(void)
@@ -2223,26 +2192,16 @@ static void __init hugetlb_init_hstates(void)
        VM_BUG_ON(minimum_order == UINT_MAX);
 }
 
-static char * __init memfmt(char *buf, unsigned long n)
-{
-       if (n >= (1UL << 30))
-               sprintf(buf, "%lu GB", n >> 30);
-       else if (n >= (1UL << 20))
-               sprintf(buf, "%lu MB", n >> 20);
-       else
-               sprintf(buf, "%lu KB", n >> 10);
-       return buf;
-}
-
 static void __init report_hugepages(void)
 {
        struct hstate *h;
 
        for_each_hstate(h) {
                char buf[32];
+
+               string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
                pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
-                       memfmt(buf, huge_page_size(h)),
-                       h->free_huge_pages);
+                       buf, h->free_huge_pages);
        }
 }
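
Reviewer note: with memfmt() gone, both report_hugepages() and the new allocation-failure warning format the page size via string_get_size(..., STRING_UNITS_2, ...), which is expected to yield IEC-style strings such as "2.00 MiB" rather than memfmt's "2048 KB". The userspace approximation below only illustrates that output shape; fmt_size_2() is invented here, and the real rounding and precision handling live in lib/string_helpers.c.

/*
 * Rough approximation of the STRING_UNITS_2 output format used for
 * the huge page size: the value scaled into binary units with an IEC
 * suffix.  Illustration only, not the kernel helper.
 */
#include <stdio.h>

static void fmt_size_2(unsigned long long bytes, char *buf, int len)
{
        static const char *const units[] =
                { "B", "KiB", "MiB", "GiB", "TiB" };
        double val = (double)bytes;
        unsigned int u = 0;

        while (val >= 1024.0 && u < 4) {
                val /= 1024.0;
                u++;
        }
        snprintf(buf, len, "%.2f %s", val, units[u]);
}

int main(void)
{
        char buf[32];

        fmt_size_2(2ULL << 20, buf, sizeof(buf));       /* 2 MiB huge page */
        printf("HugeTLB registered %s page size\n", buf);

        fmt_size_2(1ULL << 30, buf, sizeof(buf));       /* 1 GiB huge page */
        printf("HugeTLB registered %s page size\n", buf);
        return 0;
}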
 
@@ -2801,6 +2760,11 @@ static int __init hugetlb_init(void)
                return 0;
 
        if (!size_to_hstate(default_hstate_size)) {
+               if (default_hstate_size != 0) {
+                       pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
+                              default_hstate_size, HPAGE_SIZE);
+               }
+
                default_hstate_size = HPAGE_SIZE;
                if (!size_to_hstate(default_hstate_size))
                        hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
@@ -4739,40 +4703,6 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
        return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
-#ifdef CONFIG_MEMORY_FAILURE
-
-/*
- * This function is called from memory failure code.
- */
-int dequeue_hwpoisoned_huge_page(struct page *hpage)
-{
-       struct hstate *h = page_hstate(hpage);
-       int nid = page_to_nid(hpage);
-       int ret = -EBUSY;
-
-       spin_lock(&hugetlb_lock);
-       /*
-        * Just checking !page_huge_active is not enough, because that could be
-        * an isolated/hwpoisoned hugepage (which have >0 refcount).
-        */
-       if (!page_huge_active(hpage) && !page_count(hpage)) {
-               /*
-                * Hwpoisoned hugepage isn't linked to activelist or freelist,
-                * but dangling hpage->lru can trigger list-debug warnings
-                * (this happens when we call unpoison_memory() on it),
-                * so let it point to itself with list_del_init().
-                */
-               list_del_init(&hpage->lru);
-               set_page_refcounted(hpage);
-               h->free_huge_pages--;
-               h->free_huge_pages_node[nid]--;
-               ret = 0;
-       }
-       spin_unlock(&hugetlb_lock);
-       return ret;
-}
-#endif
-
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
        bool ret = true;