mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70 #include <linux/mempolicy.h>
  71 #include <linux/mm.h>
  72 #include <linux/highmem.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/kernel.h>
  75 #include <linux/sched.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/export.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/ksm.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92 #include <linux/syscalls.h>
  93 #include <linux/ctype.h>
  94 #include <linux/mm_inline.h>
  95 #include <linux/mmu_notifier.h>
  96 #include <linux/printk.h>
  97
  98 #include <asm/tlbflush.h>
  99 #include <asm/uaccess.h>
 100 #include <linux/random.h>
 101
 102 #include "internal.h"
 103
 104 /* Internal flags */
 105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 107
 108 static struct kmem_cache *policy_cache;
 109 static struct kmem_cache *sn_cache;
 110
 111 /* Highest zone. An specific allocation for a zone below that is not
 112    policied. */
 113 enum zone_type policy_zone = 0;
 114
 115 /*
 116  * run-time system-wide default policy => local allocation
 117  */
 118 static struct mempolicy default_policy = {
 119         .refcnt = ATOMIC_INIT(1), /* never free it */
 120         .mode = MPOL_PREFERRED,
 121         .flags = MPOL_F_LOCAL,
 122 };
 123
 124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 125
 126 static struct mempolicy *get_task_policy(struct task_struct *p)
 127 {
 128         struct mempolicy *pol = p->mempolicy;
 129
 130         if (!pol) {
 131                 int node = numa_node_id();
 132
 133                 if (node != NUMA_NO_NODE) {
 134                         pol = &preferred_node_policy[node];
 135                         /*
 136                          * preferred_node_policy is not initialised early in
 137                          * boot
 138                          */
 139                         if (!pol->mode)
 140                                 pol = NULL;
 141                 }
 142         }
 143
 144         return pol;
 145 }
 146
 147 static const struct mempolicy_operations {
 148         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 149         /*
 150          * If read-side task has no lock to protect task->mempolicy, write-side
 151          * task will rebind the task->mempolicy by two step. The first step is
 152          * setting all the newly nodes, and the second step is cleaning all the
 153          * disallowed nodes. In this way, we can avoid finding no node to alloc
 154          * page.
 155          * If we have a lock to protect task->mempolicy in read-side, we do
 156          * rebind directly.
 157          *
 158          * step:
 159          *      MPOL_REBIND_ONCE - do rebind work at once
 160          *      MPOL_REBIND_STEP1 - set all the newly nodes
 161          *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 162          */
 163         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 164                         enum mpol_rebind_step step);
 165 } mpol_ops[MPOL_MAX];
 166
 167 /* Check that the nodemask contains at least one populated zone */
 168 static int is_valid_nodemask(const nodemask_t *nodemask)
 169 {
 170         return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 171 }
 172
 173 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 174 {
 175         return pol->flags & MPOL_MODE_FLAGS;
 176 }
 177
 178 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 179                                    const nodemask_t *rel)
 180 {
 181         nodemask_t tmp;
 182         nodes_fold(tmp, *orig, nodes_weight(*rel));
 183         nodes_onto(*ret, tmp, *rel);
 184 }
 185
 186 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 187 {
 188         if (nodes_empty(*nodes))
 189                 return -EINVAL;
 190         pol->v.nodes = *nodes;
 191         return 0;
 192 }
 193
 194 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 195 {
 196         if (!nodes)
 197                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 198         else if (nodes_empty(*nodes))
 199                 return -EINVAL;                 /*  no allowed nodes */
 200         else
 201                 pol->v.preferred_node = first_node(*nodes);
 202         return 0;
 203 }
 204
 205 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 206 {
 207         if (!is_valid_nodemask(nodes))
 208                 return -EINVAL;
 209         pol->v.nodes = *nodes;
 210         return 0;
 211 }
 212
 213 /*
 214  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 215  * any, for the new policy.  mpol_new() has already validated the nodes
 216  * parameter with respect to the policy mode and flags.  But, we need to
 217  * handle an empty nodemask with MPOL_PREFERRED here.
 218  *
 219  * Must be called holding task's alloc_lock to protect task's mems_allowed
 220  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 221  */
 222 static int mpol_set_nodemask(struct mempolicy *pol,
 223                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 224 {
 225         int ret;
 226
 227         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 228         if (pol == NULL)
 229                 return 0;
 230         /* Check N_MEMORY */
 231         nodes_and(nsc->mask1,
 232                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 233
 234         VM_BUG_ON(!nodes);
 235         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 236                 nodes = NULL;   /* explicit local allocation */
 237         else {
 238                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 239                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 240                 else
 241                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 242
 243                 if (mpol_store_user_nodemask(pol))
 244                         pol->w.user_nodemask = *nodes;
 245                 else
 246                         pol->w.cpuset_mems_allowed =
 247                                                 cpuset_current_mems_allowed;
 248         }
 249
 250         if (nodes)
 251                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 252         else
 253                 ret = mpol_ops[pol->mode].create(pol, NULL);
 254         return ret;
 255 }
 256
 257 /*
 258  * This function just creates a new policy, does some check and simple
 259  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 260  */
 261 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 262                                   nodemask_t *nodes)
 263 {
 264         struct mempolicy *policy;
 265
 266         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 267                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 268
 269         if (mode == MPOL_DEFAULT) {
 270                 if (nodes && !nodes_empty(*nodes))
 271                         return ERR_PTR(-EINVAL);
 272                 return NULL;
 273         }
 274         VM_BUG_ON(!nodes);
 275
 276         /*
 277          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 278          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 279          * All other modes require a valid pointer to a non-empty nodemask.
 280          */
 281         if (mode == MPOL_PREFERRED) {
 282                 if (nodes_empty(*nodes)) {
 283                         if (((flags & MPOL_F_STATIC_NODES) ||
 284                              (flags & MPOL_F_RELATIVE_NODES)))
 285                                 return ERR_PTR(-EINVAL);
 286                 }
 287         } else if (mode == MPOL_LOCAL) {
 288                 if (!nodes_empty(*nodes))
 289                         return ERR_PTR(-EINVAL);
 290                 mode = MPOL_PREFERRED;
 291         } else if (nodes_empty(*nodes))
 292                 return ERR_PTR(-EINVAL);
 293         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 294         if (!policy)
 295                 return ERR_PTR(-ENOMEM);
 296         atomic_set(&policy->refcnt, 1);
 297         policy->mode = mode;
 298         policy->flags = flags;
 299
 300         return policy;
 301 }
 302
 303 /* Slow path of a mpol destructor. */
 304 void __mpol_put(struct mempolicy *p)
 305 {
 306         if (!atomic_dec_and_test(&p->refcnt))
 307                 return;
 308         kmem_cache_free(policy_cache, p);
 309 }
 310
 311 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 312                                 enum mpol_rebind_step step)
 313 {
 314 }
 315
 316 /*
 317  * step:
 318  *      MPOL_REBIND_ONCE  - do rebind work at once
 319  *      MPOL_REBIND_STEP1 - set all the newly nodes
 320  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 321  */
 322 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 323                                  enum mpol_rebind_step step)
 324 {
 325         nodemask_t tmp;
 326
 327         if (pol->flags & MPOL_F_STATIC_NODES)
 328                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 329         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 330                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 331         else {
 332                 /*
 333                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 334                  * result
 335                  */
 336                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 337                         nodes_remap(tmp, pol->v.nodes,
 338                                         pol->w.cpuset_mems_allowed, *nodes);
 339                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 340                 } else if (step == MPOL_REBIND_STEP2) {
 341                         tmp = pol->w.cpuset_mems_allowed;
 342                         pol->w.cpuset_mems_allowed = *nodes;
 343                 } else
 344                         BUG();
 345         }
 346
 347         if (nodes_empty(tmp))
 348                 tmp = *nodes;
 349
 350         if (step == MPOL_REBIND_STEP1)
 351                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 352         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 353                 pol->v.nodes = tmp;
 354         else
 355                 BUG();
 356
 357         if (!node_isset(current->il_next, tmp)) {
 358                 current->il_next = next_node(current->il_next, tmp);
 359                 if (current->il_next >= MAX_NUMNODES)
 360                         current->il_next = first_node(tmp);
 361                 if (current->il_next >= MAX_NUMNODES)
 362                         current->il_next = numa_node_id();
 363         }
 364 }
 365
 366 static void mpol_rebind_preferred(struct mempolicy *pol,
 367                                   const nodemask_t *nodes,
 368                                   enum mpol_rebind_step step)
 369 {
 370         nodemask_t tmp;
 371
 372         if (pol->flags & MPOL_F_STATIC_NODES) {
 373                 int node = first_node(pol->w.user_nodemask);
 374
 375                 if (node_isset(node, *nodes)) {
 376                         pol->v.preferred_node = node;
 377                         pol->flags &= ~MPOL_F_LOCAL;
 378                 } else
 379                         pol->flags |= MPOL_F_LOCAL;
 380         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 381                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 382                 pol->v.preferred_node = first_node(tmp);
 383         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 384                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 385                                                    pol->w.cpuset_mems_allowed,
 386                                                    *nodes);
 387                 pol->w.cpuset_mems_allowed = *nodes;
 388         }
 389 }
 390
 391 /*
 392  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 393  *
 394  * If read-side task has no lock to protect task->mempolicy, write-side
 395  * task will rebind the task->mempolicy by two step. The first step is
 396  * setting all the newly nodes, and the second step is cleaning all the
 397  * disallowed nodes. In this way, we can avoid finding no node to alloc
 398  * page.
 399  * If we have a lock to protect task->mempolicy in read-side, we do
 400  * rebind directly.
 401  *
 402  * step:
 403  *      MPOL_REBIND_ONCE  - do rebind work at once
 404  *      MPOL_REBIND_STEP1 - set all the newly nodes
 405  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 406  */
 407 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 408                                 enum mpol_rebind_step step)
 409 {
 410         if (!pol)
 411                 return;
 412         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 413             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 414                 return;
 415
 416         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 417                 return;
 418
 419         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 420                 BUG();
 421
 422         if (step == MPOL_REBIND_STEP1)
 423                 pol->flags |= MPOL_F_REBINDING;
 424         else if (step == MPOL_REBIND_STEP2)
 425                 pol->flags &= ~MPOL_F_REBINDING;
 426         else if (step >= MPOL_REBIND_NSTEP)
 427                 BUG();
 428
 429         mpol_ops[pol->mode].rebind(pol, newmask, step);
 430 }
 431
 432 /*
 433  * Wrapper for mpol_rebind_policy() that just requires task
 434  * pointer, and updates task mempolicy.
 435  *
 436  * Called with task's alloc_lock held.
 437  */
 438
 439 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 440                         enum mpol_rebind_step step)
 441 {
 442         mpol_rebind_policy(tsk->mempolicy, new, step);
 443 }
 444
 445 /*
 446  * Rebind each vma in mm to new nodemask.
 447  *
 448  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 449  */
 450
 451 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 452 {
 453         struct vm_area_struct *vma;
 454
 455         down_write(&mm->mmap_sem);
 456         for (vma = mm->mmap; vma; vma = vma->vm_next)
 457                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 458         up_write(&mm->mmap_sem);
 459 }
 460
 461 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 462         [MPOL_DEFAULT] = {
 463                 .rebind = mpol_rebind_default,
 464         },
 465         [MPOL_INTERLEAVE] = {
 466                 .create = mpol_new_interleave,
 467                 .rebind = mpol_rebind_nodemask,
 468         },
 469         [MPOL_PREFERRED] = {
 470                 .create = mpol_new_preferred,
 471                 .rebind = mpol_rebind_preferred,
 472         },
 473         [MPOL_BIND] = {
 474                 .create = mpol_new_bind,
 475                 .rebind = mpol_rebind_nodemask,
 476         },
 477 };
 478
 479 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 480                                 unsigned long flags);
 481
 482 /*
 483  * Scan through pages checking if pages follow certain conditions,
 484  * and move them to the pagelist if they do.
 485  */
 486 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 487                 unsigned long addr, unsigned long end,
 488                 const nodemask_t *nodes, unsigned long flags,
 489                 void *private)
 490 {
 491         pte_t *orig_pte;
 492         pte_t *pte;
 493         spinlock_t *ptl;
 494
 495         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 496         do {
 497                 struct page *page;
 498                 int nid;
 499
 500                 if (!pte_present(*pte))
 501                         continue;
 502                 page = vm_normal_page(vma, addr, *pte);
 503                 if (!page)
 504                         continue;
 505                 /*
 506                  * vm_normal_page() filters out zero pages, but there might
 507                  * still be PageReserved pages to skip, perhaps in a VDSO.
 508                  */
 509                 if (PageReserved(page))
 510                         continue;
 511                 nid = page_to_nid(page);
 512                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 513                         continue;
 514
 515                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 516                         migrate_page_add(page, private, flags);
 517                 else
 518                         break;
 519         } while (pte++, addr += PAGE_SIZE, addr != end);
 520         pte_unmap_unlock(orig_pte, ptl);
 521         return addr != end;
 522 }
 523
 524 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 525                 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
 526                                     void *private)
 527 {
 528 #ifdef CONFIG_HUGETLB_PAGE
 529         int nid;
 530         struct page *page;
 531         spinlock_t *ptl;
 532         pte_t entry;
 533
 534         ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 535         entry = huge_ptep_get((pte_t *)pmd);
 536         if (!pte_present(entry))
 537                 goto unlock;
 538         page = pte_page(entry);
 539         nid = page_to_nid(page);
 540         if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 541                 goto unlock;
 542         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 543         if (flags & (MPOL_MF_MOVE_ALL) ||
 544             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 545                 isolate_huge_page(page, private);
 546 unlock:
 547         spin_unlock(ptl);
 548 #else
 549         BUG();
 550 #endif
 551 }
 552
 553 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 554                 unsigned long addr, unsigned long end,
 555                 const nodemask_t *nodes, unsigned long flags,
 556                 void *private)
 557 {
 558         pmd_t *pmd;
 559         unsigned long next;
 560
 561         pmd = pmd_offset(pud, addr);
 562         do {
 563                 next = pmd_addr_end(addr, end);
 564                 if (!pmd_present(*pmd))
 565                         continue;
 566                 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
 567                         queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
 568                                                 flags, private);
 569                         continue;
 570                 }
 571                 split_huge_page_pmd(vma, addr, pmd);
 572                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 573                         continue;
 574                 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
 575                                     flags, private))
 576                         return -EIO;
 577         } while (pmd++, addr = next, addr != end);
 578         return 0;
 579 }
 580
 581 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 582                 unsigned long addr, unsigned long end,
 583                 const nodemask_t *nodes, unsigned long flags,
 584                 void *private)
 585 {
 586         pud_t *pud;
 587         unsigned long next;
 588
 589         pud = pud_offset(pgd, addr);
 590         do {
 591                 next = pud_addr_end(addr, end);
 592                 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
 593                         continue;
 594                 if (pud_none_or_clear_bad(pud))
 595                         continue;
 596                 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
 597                                     flags, private))
 598                         return -EIO;
 599         } while (pud++, addr = next, addr != end);
 600         return 0;
 601 }
 602
 603 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
 604                 unsigned long addr, unsigned long end,
 605                 const nodemask_t *nodes, unsigned long flags,
 606                 void *private)
 607 {
 608         pgd_t *pgd;
 609         unsigned long next;
 610
 611         pgd = pgd_offset(vma->vm_mm, addr);
 612         do {
 613                 next = pgd_addr_end(addr, end);
 614                 if (pgd_none_or_clear_bad(pgd))
 615                         continue;
 616                 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
 617                                     flags, private))
 618                         return -EIO;
 619         } while (pgd++, addr = next, addr != end);
 620         return 0;
 621 }
 622
 623 #ifdef CONFIG_NUMA_BALANCING
 624 /*
 625  * This is used to mark a range of virtual addresses to be inaccessible.
 626  * These are later cleared by a NUMA hinting fault. Depending on these
 627  * faults, pages may be migrated for better NUMA placement.
 628  *
 629  * This is assuming that NUMA faults are handled using PROT_NONE. If
 630  * an architecture makes a different choice, it will need further
 631  * changes to the core.
 632  */
 633 unsigned long change_prot_numa(struct vm_area_struct *vma,
 634                         unsigned long addr, unsigned long end)
 635 {
 636         int nr_updated;
 637
 638         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 639         if (nr_updated)
 640                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 641
 642         return nr_updated;
 643 }
 644 #else
 645 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 646                         unsigned long addr, unsigned long end)
 647 {
 648         return 0;
 649 }
 650 #endif /* CONFIG_NUMA_BALANCING */
 651
 652 /*
 653  * Walk through page tables and collect pages to be migrated.
 654  *
 655  * If pages found in a given range are on a set of nodes (determined by
 656  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 657  * passed via @private.)
 658  */
 659 static int
 660 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 661                 const nodemask_t *nodes, unsigned long flags, void *private)
 662 {
 663         int err = 0;
 664         struct vm_area_struct *vma, *prev;
 665
 666         vma = find_vma(mm, start);
 667         if (!vma)
 668                 return -EFAULT;
 669         prev = NULL;
 670         for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 671                 unsigned long endvma = vma->vm_end;
 672
 673                 if (endvma > end)
 674                         endvma = end;
 675                 if (vma->vm_start > start)
 676                         start = vma->vm_start;
 677
 678                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 679                         if (!vma->vm_next && vma->vm_end < end)
 680                                 return -EFAULT;
 681                         if (prev && prev->vm_end < vma->vm_start)
 682                                 return -EFAULT;
 683                 }
 684
 685                 if (flags & MPOL_MF_LAZY) {
 686                         change_prot_numa(vma, start, endvma);
 687                         goto next;
 688                 }
 689
 690                 if ((flags & MPOL_MF_STRICT) ||
 691                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 692                       vma_migratable(vma))) {
 693
 694                         err = queue_pages_pgd_range(vma, start, endvma, nodes,
 695                                                 flags, private);
 696                         if (err)
 697                                 break;
 698                 }
 699 next:
 700                 prev = vma;
 701         }
 702         return err;
 703 }
 704
 705 /*
 706  * Apply policy to a single VMA
 707  * This must be called with the mmap_sem held for writing.
 708  */
 709 static int vma_replace_policy(struct vm_area_struct *vma,
 710                                                 struct mempolicy *pol)
 711 {
 712         int err;
 713         struct mempolicy *old;
 714         struct mempolicy *new;
 715
 716         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 717                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 718                  vma->vm_ops, vma->vm_file,
 719                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 720
 721         new = mpol_dup(pol);
 722         if (IS_ERR(new))
 723                 return PTR_ERR(new);
 724
 725         if (vma->vm_ops && vma->vm_ops->set_policy) {
 726                 err = vma->vm_ops->set_policy(vma, new);
 727                 if (err)
 728                         goto err_out;
 729         }
 730
 731         old = vma->vm_policy;
 732         vma->vm_policy = new; /* protected by mmap_sem */
 733         mpol_put(old);
 734
 735         return 0;
 736  err_out:
 737         mpol_put(new);
 738         return err;
 739 }
 740
 741 /* Step 2: apply policy to a range and do splits. */
 742 static int mbind_range(struct mm_struct *mm, unsigned long start,
 743                        unsigned long end, struct mempolicy *new_pol)
 744 {
 745         struct vm_area_struct *next;
 746         struct vm_area_struct *prev;
 747         struct vm_area_struct *vma;
 748         int err = 0;
 749         pgoff_t pgoff;
 750         unsigned long vmstart;
 751         unsigned long vmend;
 752
 753         vma = find_vma(mm, start);
 754         if (!vma || vma->vm_start > start)
 755                 return -EFAULT;
 756
 757         prev = vma->vm_prev;
 758         if (start > vma->vm_start)
 759                 prev = vma;
 760
 761         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 762                 next = vma->vm_next;
 763                 vmstart = max(start, vma->vm_start);
 764                 vmend   = min(end, vma->vm_end);
 765
 766                 if (mpol_equal(vma_policy(vma), new_pol))
 767                         continue;
 768
 769                 pgoff = vma->vm_pgoff +
 770                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 771                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 772                                   vma->anon_vma, vma->vm_file, pgoff,
 773                                   new_pol);
 774                 if (prev) {
 775                         vma = prev;
 776                         next = vma->vm_next;
 777                         if (mpol_equal(vma_policy(vma), new_pol))
 778                                 continue;
 779                         /* vma_merge() joined vma && vma->next, case 8 */
 780                         goto replace;
 781                 }
 782                 if (vma->vm_start != vmstart) {
 783                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 784                         if (err)
 785                                 goto out;
 786                 }
 787                 if (vma->vm_end != vmend) {
 788                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 789                         if (err)
 790                                 goto out;
 791                 }
 792  replace:
 793                 err = vma_replace_policy(vma, new_pol);
 794                 if (err)
 795                         goto out;
 796         }
 797
 798  out:
 799         return err;
 800 }
 801
 802 /* Set the process memory policy */
 803 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 804                              nodemask_t *nodes)
 805 {
 806         struct mempolicy *new, *old;
 807         struct mm_struct *mm = current->mm;
 808         NODEMASK_SCRATCH(scratch);
 809         int ret;
 810
 811         if (!scratch)
 812                 return -ENOMEM;
 813
 814         new = mpol_new(mode, flags, nodes);
 815         if (IS_ERR(new)) {
 816                 ret = PTR_ERR(new);
 817                 goto out;
 818         }
 819         /*
 820          * prevent changing our mempolicy while show_numa_maps()
 821          * is using it.
 822          * Note:  do_set_mempolicy() can be called at init time
 823          * with no 'mm'.
 824          */
 825         if (mm)
 826                 down_write(&mm->mmap_sem);
 827         task_lock(current);
 828         ret = mpol_set_nodemask(new, nodes, scratch);
 829         if (ret) {
 830                 task_unlock(current);
 831                 if (mm)
 832                         up_write(&mm->mmap_sem);
 833                 mpol_put(new);
 834                 goto out;
 835         }
 836         old = current->mempolicy;
 837         current->mempolicy = new;
 838         if (new && new->mode == MPOL_INTERLEAVE &&
 839             nodes_weight(new->v.nodes))
 840                 current->il_next = first_node(new->v.nodes);
 841         task_unlock(current);
 842         if (mm)
 843                 up_write(&mm->mmap_sem);
 844
 845         mpol_put(old);
 846         ret = 0;
 847 out:
 848         NODEMASK_SCRATCH_FREE(scratch);
 849         return ret;
 850 }
 851
 852 /*
 853  * Return nodemask for policy for get_mempolicy() query
 854  *
 855  * Called with task's alloc_lock held
 856  */
 857 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 858 {
 859         nodes_clear(*nodes);
 860         if (p == &default_policy)
 861                 return;
 862
 863         switch (p->mode) {
 864         case MPOL_BIND:
 865                 /* Fall through */
 866         case MPOL_INTERLEAVE:
 867                 *nodes = p->v.nodes;
 868                 break;
 869         case MPOL_PREFERRED:
 870                 if (!(p->flags & MPOL_F_LOCAL))
 871                         node_set(p->v.preferred_node, *nodes);
 872                 /* else return empty node mask for local allocation */
 873                 break;
 874         default:
 875                 BUG();
 876         }
 877 }
 878
 879 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 880 {
 881         struct page *p;
 882         int err;
 883
 884         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 885         if (err >= 0) {
 886                 err = page_to_nid(p);
 887                 put_page(p);
 888         }
 889         return err;
 890 }
 891
 892 /* Retrieve NUMA policy */
 893 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 894                              unsigned long addr, unsigned long flags)
 895 {
 896         int err;
 897         struct mm_struct *mm = current->mm;
 898         struct vm_area_struct *vma = NULL;
 899         struct mempolicy *pol = current->mempolicy;
 900
 901         if (flags &
 902                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 903                 return -EINVAL;
 904
 905         if (flags & MPOL_F_MEMS_ALLOWED) {
 906                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 907                         return -EINVAL;
 908                 *policy = 0;    /* just so it's initialized */
 909                 task_lock(current);
 910                 *nmask  = cpuset_current_mems_allowed;
 911                 task_unlock(current);
 912                 return 0;
 913         }
 914
 915         if (flags & MPOL_F_ADDR) {
 916                 /*
 917                  * Do NOT fall back to task policy if the
 918                  * vma/shared policy at addr is NULL.  We
 919                  * want to return MPOL_DEFAULT in this case.
 920                  */
 921                 down_read(&mm->mmap_sem);
 922                 vma = find_vma_intersection(mm, addr, addr+1);
 923                 if (!vma) {
 924                         up_read(&mm->mmap_sem);
 925                         return -EFAULT;
 926                 }
 927                 if (vma->vm_ops && vma->vm_ops->get_policy)
 928                         pol = vma->vm_ops->get_policy(vma, addr);
 929                 else
 930                         pol = vma->vm_policy;
 931         } else if (addr)
 932                 return -EINVAL;
 933
 934         if (!pol)
 935                 pol = &default_policy;  /* indicates default behavior */
 936
 937         if (flags & MPOL_F_NODE) {
 938                 if (flags & MPOL_F_ADDR) {
 939                         err = lookup_node(mm, addr);
 940                         if (err < 0)
 941                                 goto out;
 942                         *policy = err;
 943                 } else if (pol == current->mempolicy &&
 944                                 pol->mode == MPOL_INTERLEAVE) {
 945                         *policy = current->il_next;
 946                 } else {
 947                         err = -EINVAL;
 948                         goto out;
 949                 }
 950         } else {
 951                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 952                                                 pol->mode;
 953                 /*
 954                  * Internal mempolicy flags must be masked off before exposing
 955                  * the policy to userspace.
 956                  */
 957                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 958         }
 959
 960         if (vma) {
 961                 up_read(&current->mm->mmap_sem);
 962                 vma = NULL;
 963         }
 964
 965         err = 0;
 966         if (nmask) {
 967                 if (mpol_store_user_nodemask(pol)) {
 968                         *nmask = pol->w.user_nodemask;
 969                 } else {
 970                         task_lock(current);
 971                         get_policy_nodemask(pol, nmask);
 972                         task_unlock(current);
 973                 }
 974         }
 975
 976  out:
 977         mpol_cond_put(pol);
 978         if (vma)
 979                 up_read(&current->mm->mmap_sem);
 980         return err;
 981 }
 982
 983 #ifdef CONFIG_MIGRATION
 984 /*
 985  * page migration
 986  */
 987 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 988                                 unsigned long flags)
 989 {
 990         /*
 991          * Avoid migrating a page that is shared with others.
 992          */
 993         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 994                 if (!isolate_lru_page(page)) {
 995                         list_add_tail(&page->lru, pagelist);
 996                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 997                                             page_is_file_cache(page));
 998                 }
 999         }
1000 }
1001
1002 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1003 {
1004         if (PageHuge(page))
1005                 return alloc_huge_page_node(page_hstate(compound_head(page)),
1006                                         node);
1007         else
1008                 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1009 }
1010
1011 /*
1012  * Migrate pages from one node to a target node.
1013  * Returns error or the number of pages not migrated.
1014  */
1015 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1016                            int flags)
1017 {
1018         nodemask_t nmask;
1019         LIST_HEAD(pagelist);
1020         int err = 0;
1021
1022         nodes_clear(nmask);
1023         node_set(source, nmask);
1024
1025         /*
1026          * This does not "check" the range but isolates all pages that
1027          * need migration.  Between passing in the full user address
1028          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1029          */
1030         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1031         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1032                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1033
1034         if (!list_empty(&pagelist)) {
1035                 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1036                                         MIGRATE_SYNC, MR_SYSCALL);
1037                 if (err)
1038                         putback_movable_pages(&pagelist);
1039         }
1040
1041         return err;
1042 }
1043
1044 /*
1045  * Move pages between the two nodesets so as to preserve the physical
1046  * layout as much as possible.
1047  *
1048  * Returns the number of page that could not be moved.
1049  */
1050 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1051                      const nodemask_t *to, int flags)
1052 {
1053         int busy = 0;
1054         int err;
1055         nodemask_t tmp;
1056
1057         err = migrate_prep();
1058         if (err)
1059                 return err;
1060
1061         down_read(&mm->mmap_sem);
1062
1063         err = migrate_vmas(mm, from, to, flags);
1064         if (err)
1065                 goto out;
1066
1067         /*
1068          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1069          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1070          * bit in 'tmp', and return that <source, dest> pair for migration.
1071          * The pair of nodemasks 'to' and 'from' define the map.
1072          *
1073          * If no pair of bits is found that way, fallback to picking some
1074          * pair of 'source' and 'dest' bits that are not the same.  If the
1075          * 'source' and 'dest' bits are the same, this represents a node
1076          * that will be migrating to itself, so no pages need move.
1077          *
1078          * If no bits are left in 'tmp', or if all remaining bits left
1079          * in 'tmp' correspond to the same bit in 'to', return false
1080          * (nothing left to migrate).
1081          *
1082          * This lets us pick a pair of nodes to migrate between, such that
1083          * if possible the dest node is not already occupied by some other
1084          * source node, minimizing the risk of overloading the memory on a
1085          * node that would happen if we migrated incoming memory to a node
1086          * before migrating outgoing memory source that same node.
1087          *
1088          * A single scan of tmp is sufficient.  As we go, we remember the
1089          * most recent <s, d> pair that moved (s != d).  If we find a pair
1090          * that not only moved, but what's better, moved to an empty slot
1091          * (d is not set in tmp), then we break out then, with that pair.
1092          * Otherwise when we finish scanning from_tmp, we at least have the
1093          * most recent <s, d> pair that moved.  If we get all the way through
1094          * the scan of tmp without finding any node that moved, much less
1095          * moved to an empty node, then there is nothing left worth migrating.
1096          */
1097
1098         tmp = *from;
1099         while (!nodes_empty(tmp)) {
1100                 int s,d;
1101                 int source = NUMA_NO_NODE;
1102                 int dest = 0;
1103
1104                 for_each_node_mask(s, tmp) {
1105
1106                         /*
1107                          * do_migrate_pages() tries to maintain the relative
1108                          * node relationship of the pages established between
1109                          * threads and memory areas.
1110                          *
1111                          * However if the number of source nodes is not equal to
1112                          * the number of destination nodes we can not preserve
1113                          * this node relative relationship.  In that case, skip
1114                          * copying memory from a node that is in the destination
1115                          * mask.
1116                          *
1117                          * Example: [2,3,4] -> [3,4,5] moves everything.
1118                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1119                          */
1120
1121                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1122                                                 (node_isset(s, *to)))
1123                                 continue;
1124
1125                         d = node_remap(s, *from, *to);
1126                         if (s == d)
1127                                 continue;
1128
1129                         source = s;     /* Node moved. Memorize */
1130                         dest = d;
1131
1132                         /* dest not in remaining from nodes? */
1133                         if (!node_isset(dest, tmp))
1134                                 break;
1135                 }
1136                 if (source == NUMA_NO_NODE)
1137                         break;
1138
1139                 node_clear(source, tmp);
1140                 err = migrate_to_node(mm, source, dest, flags);
1141                 if (err > 0)
1142                         busy += err;
1143                 if (err < 0)
1144                         break;
1145         }
1146 out:
1147         up_read(&mm->mmap_sem);
1148         if (err < 0)
1149                 return err;
1150         return busy;
1151
1152 }
1153
1154 /*
1155  * Allocate a new page for page migration based on vma policy.
1156  * Start by assuming the page is mapped by the same vma as contains @start.
1157  * Search forward from there, if not.  N.B., this assumes that the
1158  * list of pages handed to migrate_pages()--which is how we get here--
1159  * is in virtual address order.
1160  */
1161 static struct page *new_page(struct page *page, unsigned long start, int **x)
1162 {
1163         struct vm_area_struct *vma;
1164         unsigned long uninitialized_var(address);
1165
1166         vma = find_vma(current->mm, start);
1167         while (vma) {
1168                 address = page_address_in_vma(page, vma);
1169                 if (address != -EFAULT)
1170                         break;
1171                 vma = vma->vm_next;
1172         }
1173
1174         if (PageHuge(page)) {
1175                 BUG_ON(!vma);
1176                 return alloc_huge_page_noerr(vma, address, 1);
1177         }
1178         /*
1179          * if !vma, alloc_page_vma() will use task or system default policy
1180          */
1181         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1182 }
1183 #else
1184
/* !CONFIG_MIGRATION stub: pages are never queued for migration. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
1189
1190 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1191                      const nodemask_t *to, int flags)
1192 {
1193         return -ENOSYS;
1194 }
1195
1196 static struct page *new_page(struct page *page, unsigned long start, int **x)
1197 {
1198         return NULL;
1199 }
1200 #endif
1201
1202 static long do_mbind(unsigned long start, unsigned long len,
1203                      unsigned short mode, unsigned short mode_flags,
1204                      nodemask_t *nmask, unsigned long flags)
1205 {
1206         struct mm_struct *mm = current->mm;
1207         struct mempolicy *new;
1208         unsigned long end;
1209         int err;
1210         LIST_HEAD(pagelist);
1211
1212         if (flags & ~(unsigned long)MPOL_MF_VALID)
1213                 return -EINVAL;
1214         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1215                 return -EPERM;
1216
1217         if (start & ~PAGE_MASK)
1218                 return -EINVAL;
1219
1220         if (mode == MPOL_DEFAULT)
1221                 flags &= ~MPOL_MF_STRICT;
1222
1223         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1224         end = start + len;
1225
1226         if (end < start)
1227                 return -EINVAL;
1228         if (end == start)
1229                 return 0;
1230
1231         new = mpol_new(mode, mode_flags, nmask);
1232         if (IS_ERR(new))
1233                 return PTR_ERR(new);
1234
1235         if (flags & MPOL_MF_LAZY)
1236                 new->flags |= MPOL_F_MOF;
1237
1238         /*
1239          * If we are using the default policy then operation
1240          * on discontinuous address spaces is okay after all
1241          */
1242         if (!new)
1243                 flags |= MPOL_MF_DISCONTIG_OK;
1244
1245         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1246                  start, start + len, mode, mode_flags,
1247                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1248
1249         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1250
1251                 err = migrate_prep();
1252                 if (err)
1253                         goto mpol_out;
1254         }
1255         {
1256                 NODEMASK_SCRATCH(scratch);
1257                 if (scratch) {
1258                         down_write(&mm->mmap_sem);
1259                         task_lock(current);
1260                         err = mpol_set_nodemask(new, nmask, scratch);
1261                         task_unlock(current);
1262                         if (err)
1263                                 up_write(&mm->mmap_sem);
1264                 } else
1265                         err = -ENOMEM;
1266                 NODEMASK_SCRATCH_FREE(scratch);
1267         }
1268         if (err)
1269                 goto mpol_out;
1270
1271         err = queue_pages_range(mm, start, end, nmask,
1272                           flags | MPOL_MF_INVERT, &pagelist);
1273         if (!err)
1274                 err = mbind_range(mm, start, end, new);
1275
1276         if (!err) {
1277                 int nr_failed = 0;
1278
1279                 if (!list_empty(&pagelist)) {
1280                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1281                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1282                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1283                         if (nr_failed)
1284                                 putback_movable_pages(&pagelist);
1285                 }
1286
1287                 if (nr_failed && (flags & MPOL_MF_STRICT))
1288                         err = -EIO;
1289         } else
1290                 putback_movable_pages(&pagelist);
1291
1292         up_write(&mm->mmap_sem);
1293  mpol_out:
1294         mpol_put(new);
1295         return err;
1296 }
1297
1298 /*
1299  * User space interface with variable sized bitmaps for nodelists.
1300  */
1301
1302 /* Copy a node mask from user space. */
1303 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1304                      unsigned long maxnode)
1305 {
1306         unsigned long k;
1307         unsigned long nlongs;
1308         unsigned long endmask;
1309
1310         --maxnode;
1311         nodes_clear(*nodes);
1312         if (maxnode == 0 || !nmask)
1313                 return 0;
1314         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1315                 return -EINVAL;
1316
1317         nlongs = BITS_TO_LONGS(maxnode);
1318         if ((maxnode % BITS_PER_LONG) == 0)
1319                 endmask = ~0UL;
1320         else
1321                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1322
1323         /* When the user specified more nodes than supported just check
1324            if the non supported part is all zero. */
1325         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1326                 if (nlongs > PAGE_SIZE/sizeof(long))
1327                         return -EINVAL;
1328                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1329                         unsigned long t;
1330                         if (get_user(t, nmask + k))
1331                                 return -EFAULT;
1332                         if (k == nlongs - 1) {
1333                                 if (t & endmask)
1334                                         return -EINVAL;
1335                         } else if (t)
1336                                 return -EINVAL;
1337                 }
1338                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1339                 endmask = ~0UL;
1340         }
1341
1342         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1343                 return -EFAULT;
1344         nodes_addr(*nodes)[nlongs-1] &= endmask;
1345         return 0;
1346 }
1347
1348 /* Copy a kernel node mask to user space */
1349 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1350                               nodemask_t *nodes)
1351 {
1352         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1353         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1354
1355         if (copy > nbytes) {
1356                 if (copy > PAGE_SIZE)
1357                         return -EINVAL;
1358                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1359                         return -EFAULT;
1360                 copy = nbytes;
1361         }
1362         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1363 }
1364
1365 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1366                 unsigned long, mode, const unsigned long __user *, nmask,
1367                 unsigned long, maxnode, unsigned, flags)
1368 {
1369         nodemask_t nodes;
1370         int err;
1371         unsigned short mode_flags;
1372
1373         mode_flags = mode & MPOL_MODE_FLAGS;
1374         mode &= ~MPOL_MODE_FLAGS;
1375         if (mode >= MPOL_MAX)
1376                 return -EINVAL;
1377         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1378             (mode_flags & MPOL_F_RELATIVE_NODES))
1379                 return -EINVAL;
1380         err = get_nodes(&nodes, nmask, maxnode);
1381         if (err)
1382                 return err;
1383         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1384 }
1385
1386 /* Set the process memory policy */
1387 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1388                 unsigned long, maxnode)
1389 {
1390         int err;
1391         nodemask_t nodes;
1392         unsigned short flags;
1393
1394         flags = mode & MPOL_MODE_FLAGS;
1395         mode &= ~MPOL_MODE_FLAGS;
1396         if ((unsigned int)mode >= MPOL_MAX)
1397                 return -EINVAL;
1398         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1399                 return -EINVAL;
1400         err = get_nodes(&nodes, nmask, maxnode);
1401         if (err)
1402                 return err;
1403         return do_set_mempolicy(mode, flags, &nodes);
1404 }
1405
1406 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1407                 const unsigned long __user *, old_nodes,
1408                 const unsigned long __user *, new_nodes)
1409 {
1410         const struct cred *cred = current_cred(), *tcred;
1411         struct mm_struct *mm = NULL;
1412         struct task_struct *task;
1413         nodemask_t task_nodes;
1414         int err;
1415         nodemask_t *old;
1416         nodemask_t *new;
1417         NODEMASK_SCRATCH(scratch);
1418
1419         if (!scratch)
1420                 return -ENOMEM;
1421
1422         old = &scratch->mask1;
1423         new = &scratch->mask2;
1424
1425         err = get_nodes(old, old_nodes, maxnode);
1426         if (err)
1427                 goto out;
1428
1429         err = get_nodes(new, new_nodes, maxnode);
1430         if (err)
1431                 goto out;
1432
1433         /* Find the mm_struct */
1434         rcu_read_lock();
1435         task = pid ? find_task_by_vpid(pid) : current;
1436         if (!task) {
1437                 rcu_read_unlock();
1438                 err = -ESRCH;
1439                 goto out;
1440         }
1441         get_task_struct(task);
1442
1443         err = -EINVAL;
1444
1445         /*
1446          * Check if this process has the right to modify the specified
1447          * process. The right exists if the process has administrative
1448          * capabilities, superuser privileges or the same
1449          * userid as the target process.
1450          */
1451         tcred = __task_cred(task);
1452         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1453             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1454             !capable(CAP_SYS_NICE)) {
1455                 rcu_read_unlock();
1456                 err = -EPERM;
1457                 goto out_put;
1458         }
1459         rcu_read_unlock();
1460
1461         task_nodes = cpuset_mems_allowed(task);
1462         /* Is the user allowed to access the target nodes? */
1463         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1464                 err = -EPERM;
1465                 goto out_put;
1466         }
1467
1468         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1469                 err = -EINVAL;
1470                 goto out_put;
1471         }
1472
1473         err = security_task_movememory(task);
1474         if (err)
1475                 goto out_put;
1476
1477         mm = get_task_mm(task);
1478         put_task_struct(task);
1479
1480         if (!mm) {
1481                 err = -EINVAL;
1482                 goto out;
1483         }
1484
1485         err = do_migrate_pages(mm, old, new,
1486                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1487
1488         mmput(mm);
1489 out:
1490         NODEMASK_SCRATCH_FREE(scratch);
1491
1492         return err;
1493
1494 out_put:
1495         put_task_struct(task);
1496         goto out;
1497
1498 }
1499
1500
1501 /* Retrieve NUMA policy */
1502 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1503                 unsigned long __user *, nmask, unsigned long, maxnode,
1504                 unsigned long, addr, unsigned long, flags)
1505 {
1506         int err;
1507         int uninitialized_var(pval);
1508         nodemask_t nodes;
1509
1510         if (nmask != NULL && maxnode < MAX_NUMNODES)
1511                 return -EINVAL;
1512
1513         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1514
1515         if (err)
1516                 return err;
1517
1518         if (policy && put_user(pval, policy))
1519                 return -EFAULT;
1520
1521         if (nmask)
1522                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1523
1524         return err;
1525 }
1526
1527 #ifdef CONFIG_COMPAT
1528
1529 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1530                        compat_ulong_t __user *, nmask,
1531                        compat_ulong_t, maxnode,
1532                        compat_ulong_t, addr, compat_ulong_t, flags)
1533 {
1534         long err;
1535         unsigned long __user *nm = NULL;
1536         unsigned long nr_bits, alloc_size;
1537         DECLARE_BITMAP(bm, MAX_NUMNODES);
1538
1539         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1540         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1541
1542         if (nmask)
1543                 nm = compat_alloc_user_space(alloc_size);
1544
1545         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1546
1547         if (!err && nmask) {
1548                 unsigned long copy_size;
1549                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1550                 err = copy_from_user(bm, nm, copy_size);
1551                 /* ensure entire bitmap is zeroed */
1552                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1553                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1554         }
1555
1556         return err;
1557 }
1558
1559 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1560                        compat_ulong_t, maxnode)
1561 {
1562         long err = 0;
1563         unsigned long __user *nm = NULL;
1564         unsigned long nr_bits, alloc_size;
1565         DECLARE_BITMAP(bm, MAX_NUMNODES);
1566
1567         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1568         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1569
1570         if (nmask) {
1571                 err = compat_get_bitmap(bm, nmask, nr_bits);
1572                 nm = compat_alloc_user_space(alloc_size);
1573                 err |= copy_to_user(nm, bm, alloc_size);
1574         }
1575
1576         if (err)
1577                 return -EFAULT;
1578
1579         return sys_set_mempolicy(mode, nm, nr_bits+1);
1580 }
1581
1582 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1583                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1584                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1585 {
1586         long err = 0;
1587         unsigned long __user *nm = NULL;
1588         unsigned long nr_bits, alloc_size;
1589         nodemask_t bm;
1590
1591         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1592         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1593
1594         if (nmask) {
1595                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1596                 nm = compat_alloc_user_space(alloc_size);
1597                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1598         }
1599
1600         if (err)
1601                 return -EFAULT;
1602
1603         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1604 }
1605
1606 #endif
1607
1608 /*
1609  * get_vma_policy(@task, @vma, @addr)
1610  * @task: task for fallback if vma policy == default
1611  * @vma: virtual memory area whose policy is sought
1612  * @addr: address in @vma for shared policy lookup
1613  *
1614  * Returns effective policy for a VMA at specified address.
1615  * Falls back to @task or system default policy, as necessary.
1616  * Current or other task's task mempolicy and non-shared vma policies must be
1617  * protected by task_lock(task) by the caller.
1618  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1619  * count--added by the get_policy() vm_op, as appropriate--to protect against
1620  * freeing by another task.  It is the caller's responsibility to free the
1621  * extra reference for shared policies.
1622  */
1623 struct mempolicy *get_vma_policy(struct task_struct *task,
1624                 struct vm_area_struct *vma, unsigned long addr)
1625 {
1626         struct mempolicy *pol = get_task_policy(task);
1627
1628         if (vma) {
1629                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1630                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1631                                                                         addr);
1632                         if (vpol)
1633                                 pol = vpol;
1634                 } else if (vma->vm_policy) {
1635                         pol = vma->vm_policy;
1636
1637                         /*
1638                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1639                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1640                          * count on these policies which will be dropped by
1641                          * mpol_cond_put() later
1642                          */
1643                         if (mpol_needs_cond_ref(pol))
1644                                 mpol_get(pol);
1645                 }
1646         }
1647         if (!pol)
1648                 pol = &default_policy;
1649         return pol;
1650 }
1651
1652 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1653 {
1654         struct mempolicy *pol = get_task_policy(task);
1655         if (vma) {
1656                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1657                         bool ret = false;
1658
1659                         pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1660                         if (pol && (pol->flags & MPOL_F_MOF))
1661                                 ret = true;
1662                         mpol_cond_put(pol);
1663
1664                         return ret;
1665                 } else if (vma->vm_policy) {
1666                         pol = vma->vm_policy;
1667                 }
1668         }
1669
1670         if (!pol)
1671                 return default_policy.flags & MPOL_F_MOF;
1672
1673         return pol->flags & MPOL_F_MOF;
1674 }
1675
1676 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1677 {
1678         enum zone_type dynamic_policy_zone = policy_zone;
1679
1680         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1681
1682         /*
1683          * if policy->v.nodes has movable memory only,
1684          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1685          *
1686          * policy->v.nodes is intersect with node_states[N_MEMORY].
1687          * so if the following test faile, it implies
1688          * policy->v.nodes has movable memory only.
1689          */
1690         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1691                 dynamic_policy_zone = ZONE_MOVABLE;
1692
1693         return zone >= dynamic_policy_zone;
1694 }
1695
1696 /*
1697  * Return a nodemask representing a mempolicy for filtering nodes for
1698  * page allocation
1699  */
1700 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1701 {
1702         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1703         if (unlikely(policy->mode == MPOL_BIND) &&
1704                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1705                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1706                 return &policy->v.nodes;
1707
1708         return NULL;
1709 }
1710
1711 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1712 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1713         int nd)
1714 {
1715         switch (policy->mode) {
1716         case MPOL_PREFERRED:
1717                 if (!(policy->flags & MPOL_F_LOCAL))
1718                         nd = policy->v.preferred_node;
1719                 break;
1720         case MPOL_BIND:
1721                 /*
1722                  * Normally, MPOL_BIND allocations are node-local within the
1723                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1724                  * current node isn't part of the mask, we use the zonelist for
1725                  * the first node in the mask instead.
1726                  */
1727                 if (unlikely(gfp & __GFP_THISNODE) &&
1728                                 unlikely(!node_isset(nd, policy->v.nodes)))
1729                         nd = first_node(policy->v.nodes);
1730                 break;
1731         default:
1732                 BUG();
1733         }
1734         return node_zonelist(nd, gfp);
1735 }
1736
1737 /* Do dynamic interleaving for a process */
1738 static unsigned interleave_nodes(struct mempolicy *policy)
1739 {
1740         unsigned nid, next;
1741         struct task_struct *me = current;
1742
1743         nid = me->il_next;
1744         next = next_node(nid, policy->v.nodes);
1745         if (next >= MAX_NUMNODES)
1746                 next = first_node(policy->v.nodes);
1747         if (next < MAX_NUMNODES)
1748                 me->il_next = next;
1749         return nid;
1750 }
1751
1752 /*
1753  * Depending on the memory policy provide a node from which to allocate the
1754  * next slab entry.
1755  */
1756 unsigned int mempolicy_slab_node(void)
1757 {
1758         struct mempolicy *policy;
1759         int node = numa_mem_id();
1760
1761         if (in_interrupt())
1762                 return node;
1763
1764         policy = current->mempolicy;
1765         if (!policy || policy->flags & MPOL_F_LOCAL)
1766                 return node;
1767
1768         switch (policy->mode) {
1769         case MPOL_PREFERRED:
1770                 /*
1771                  * handled MPOL_F_LOCAL above
1772                  */
1773                 return policy->v.preferred_node;
1774
1775         case MPOL_INTERLEAVE:
1776                 return interleave_nodes(policy);
1777
1778         case MPOL_BIND: {
1779                 /*
1780                  * Follow bind policy behavior and start allocation at the
1781                  * first node.
1782                  */
1783                 struct zonelist *zonelist;
1784                 struct zone *zone;
1785                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1786                 zonelist = &NODE_DATA(node)->node_zonelists[0];
1787                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1788                                                         &policy->v.nodes,
1789                                                         &zone);
1790                 return zone ? zone->node : node;
1791         }
1792
1793         default:
1794                 BUG();
1795         }
1796 }
1797
1798 /* Do static interleaving for a VMA with known offset. */
1799 static unsigned offset_il_node(struct mempolicy *pol,
1800                 struct vm_area_struct *vma, unsigned long off)
1801 {
1802         unsigned nnodes = nodes_weight(pol->v.nodes);
1803         unsigned target;
1804         int c;
1805         int nid = NUMA_NO_NODE;
1806
1807         if (!nnodes)
1808                 return numa_node_id();
1809         target = (unsigned int)off % nnodes;
1810         c = 0;
1811         do {
1812                 nid = next_node(nid, pol->v.nodes);
1813                 c++;
1814         } while (c <= target);
1815         return nid;
1816 }
1817
1818 /* Determine a node number for interleave */
1819 static inline unsigned interleave_nid(struct mempolicy *pol,
1820                  struct vm_area_struct *vma, unsigned long addr, int shift)
1821 {
1822         if (vma) {
1823                 unsigned long off;
1824
1825                 /*
1826                  * for small pages, there is no difference between
1827                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1828                  * for huge pages, since vm_pgoff is in units of small
1829                  * pages, we need to shift off the always 0 bits to get
1830                  * a useful offset.
1831                  */
1832                 BUG_ON(shift < PAGE_SHIFT);
1833                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1834                 off += (addr - vma->vm_start) >> shift;
1835                 return offset_il_node(pol, vma, off);
1836         } else
1837                 return interleave_nodes(pol);
1838 }
1839
1840 /*
1841  * Return the bit number of a random bit set in the nodemask.
1842  * (returns NUMA_NO_NODE if nodemask is empty)
1843  */
1844 int node_random(const nodemask_t *maskp)
1845 {
1846         int w, bit = NUMA_NO_NODE;
1847
1848         w = nodes_weight(*maskp);
1849         if (w)
1850                 bit = bitmap_ord_to_pos(maskp->bits,
1851                         get_random_int() % w, MAX_NUMNODES);
1852         return bit;
1853 }
1854
1855 #ifdef CONFIG_HUGETLBFS
1856 /*
1857  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1858  * @vma: virtual memory area whose policy is sought
1859  * @addr: address in @vma for shared policy lookup and interleave policy
1860  * @gfp_flags: for requested zone
1861  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1862  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1863  *
1864  * Returns a zonelist suitable for a huge page allocation and a pointer
1865  * to the struct mempolicy for conditional unref after allocation.
1866  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1867  * @nodemask for filtering the zonelist.
1868  *
1869  * Must be protected by read_mems_allowed_begin()
1870  */
1871 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1872                                 gfp_t gfp_flags, struct mempolicy **mpol,
1873                                 nodemask_t **nodemask)
1874 {
1875         struct zonelist *zl;
1876
1877         *mpol = get_vma_policy(current, vma, addr);
1878         *nodemask = NULL;       /* assume !MPOL_BIND */
1879
1880         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1881                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1882                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1883         } else {
1884                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1885                 if ((*mpol)->mode == MPOL_BIND)
1886                         *nodemask = &(*mpol)->v.nodes;
1887         }
1888         return zl;
1889 }
1890
1891 /*
1892  * init_nodemask_of_mempolicy
1893  *
1894  * If the current task's mempolicy is "default" [NULL], return 'false'
1895  * to indicate default policy.  Otherwise, extract the policy nodemask
1896  * for 'bind' or 'interleave' policy into the argument nodemask, or
1897  * initialize the argument nodemask to contain the single node for
1898  * 'preferred' or 'local' policy and return 'true' to indicate presence
1899  * of non-default mempolicy.
1900  *
1901  * We don't bother with reference counting the mempolicy [mpol_get/put]
1902  * because the current task is examining it's own mempolicy and a task's
1903  * mempolicy is only ever changed by the task itself.
1904  *
1905  * N.B., it is the caller's responsibility to free a returned nodemask.
1906  */
1907 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1908 {
1909         struct mempolicy *mempolicy;
1910         int nid;
1911
1912         if (!(mask && current->mempolicy))
1913                 return false;
1914
1915         task_lock(current);
1916         mempolicy = current->mempolicy;
1917         switch (mempolicy->mode) {
1918         case MPOL_PREFERRED:
1919                 if (mempolicy->flags & MPOL_F_LOCAL)
1920                         nid = numa_node_id();
1921                 else
1922                         nid = mempolicy->v.preferred_node;
1923                 init_nodemask_of_node(mask, nid);
1924                 break;
1925
1926         case MPOL_BIND:
1927                 /* Fall through */
1928         case MPOL_INTERLEAVE:
1929                 *mask =  mempolicy->v.nodes;
1930                 break;
1931
1932         default:
1933                 BUG();
1934         }
1935         task_unlock(current);
1936
1937         return true;
1938 }
1939 #endif
1940
1941 /*
1942  * mempolicy_nodemask_intersects
1943  *
1944  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1945  * policy.  Otherwise, check for intersection between mask and the policy
1946  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1947  * policy, always return true since it may allocate elsewhere on fallback.
1948  *
1949  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1950  */
1951 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1952                                         const nodemask_t *mask)
1953 {
1954         struct mempolicy *mempolicy;
1955         bool ret = true;
1956
1957         if (!mask)
1958                 return ret;
1959         task_lock(tsk);
1960         mempolicy = tsk->mempolicy;
1961         if (!mempolicy)
1962                 goto out;
1963
1964         switch (mempolicy->mode) {
1965         case MPOL_PREFERRED:
1966                 /*
1967                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1968                  * allocate from, they may fallback to other nodes when oom.
1969                  * Thus, it's possible for tsk to have allocated memory from
1970                  * nodes in mask.
1971                  */
1972                 break;
1973         case MPOL_BIND:
1974         case MPOL_INTERLEAVE:
1975                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1976                 break;
1977         default:
1978                 BUG();
1979         }
1980 out:
1981         task_unlock(tsk);
1982         return ret;
1983 }
1984
1985 /* Allocate a page in interleaved policy.
1986    Own path because it needs to do special accounting. */
1987 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1988                                         unsigned nid)
1989 {
1990         struct zonelist *zl;
1991         struct page *page;
1992
1993         zl = node_zonelist(nid, gfp);
1994         page = __alloc_pages(gfp, order, zl);
1995         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1996                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1997         return page;
1998 }
1999
2000 /**
2001  *      alloc_pages_vma - Allocate a page for a VMA.
2002  *
2003  *      @gfp:
2004  *      %GFP_USER    user allocation.
2005  *      %GFP_KERNEL  kernel allocations,
2006  *      %GFP_HIGHMEM highmem/user allocations,
2007  *      %GFP_FS      allocation should not call back into a file system.
2008  *      %GFP_ATOMIC  don't sleep.
2009  *
2010  *      @order:Order of the GFP allocation.
2011  *      @vma:  Pointer to VMA or NULL if not available.
2012  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2013  *
2014  *      This function allocates a page from the kernel page pool and applies
2015  *      a NUMA policy associated with the VMA or the current process.
2016  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2017  *      mm_struct of the VMA to prevent it from going away. Should be used for
2018  *      all allocations for pages that will be mapped into
2019  *      user space. Returns NULL when no page can be allocated.
2020  *
2021  *      Should be called with the mm_sem of the vma hold.
2022  */
2023 struct page *
2024 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2025                 unsigned long addr, int node)
2026 {
2027         struct mempolicy *pol;
2028         struct page *page;
2029         unsigned int cpuset_mems_cookie;
2030
2031 retry_cpuset:
2032         pol = get_vma_policy(current, vma, addr);
2033         cpuset_mems_cookie = read_mems_allowed_begin();
2034
2035         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2036                 unsigned nid;
2037
2038                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2039                 mpol_cond_put(pol);
2040                 page = alloc_page_interleave(gfp, order, nid);
2041                 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2042                         goto retry_cpuset;
2043
2044                 return page;
2045         }
2046         page = __alloc_pages_nodemask(gfp, order,
2047                                       policy_zonelist(gfp, pol, node),
2048                                       policy_nodemask(gfp, pol));
2049         if (unlikely(mpol_needs_cond_ref(pol)))
2050                 __mpol_put(pol);
2051         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2052                 goto retry_cpuset;
2053         return page;
2054 }
2055
2056 /**
2057  *      alloc_pages_current - Allocate pages.
2058  *
2059  *      @gfp:
2060  *              %GFP_USER   user allocation,
2061  *              %GFP_KERNEL kernel allocation,
2062  *              %GFP_HIGHMEM highmem allocation,
2063  *              %GFP_FS     don't call back into a file system.
2064  *              %GFP_ATOMIC don't sleep.
2065  *      @order: Power of two of allocation size in pages. 0 is a single page.
2066  *
2067  *      Allocate a page from the kernel page pool.  When not in
2068  *      interrupt context and apply the current process NUMA policy.
2069  *      Returns NULL when no page can be allocated.
2070  *
2071  *      Don't call cpuset_update_task_memory_state() unless
2072  *      1) it's ok to take cpuset_sem (can WAIT), and
2073  *      2) allocating for current task (not interrupt).
2074  */
2075 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2076 {
2077         struct mempolicy *pol = get_task_policy(current);
2078         struct page *page;
2079         unsigned int cpuset_mems_cookie;
2080
2081         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2082                 pol = &default_policy;
2083
2084 retry_cpuset:
2085         cpuset_mems_cookie = read_mems_allowed_begin();
2086
2087         /*
2088          * No reference counting needed for current->mempolicy
2089          * nor system default_policy
2090          */
2091         if (pol->mode == MPOL_INTERLEAVE)
2092                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2093         else
2094                 page = __alloc_pages_nodemask(gfp, order,
2095                                 policy_zonelist(gfp, pol, numa_node_id()),
2096                                 policy_nodemask(gfp, pol));
2097
2098         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2099                 goto retry_cpuset;
2100
2101         return page;
2102 }
2103 EXPORT_SYMBOL(alloc_pages_current);
2104
2105 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2106 {
2107         struct mempolicy *pol = mpol_dup(vma_policy(src));
2108
2109         if (IS_ERR(pol))
2110                 return PTR_ERR(pol);
2111         dst->vm_policy = pol;
2112         return 0;
2113 }
2114
2115 /*
2116  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2117  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2118  * with the mems_allowed returned by cpuset_mems_allowed().  This
2119  * keeps mempolicies cpuset relative after its cpuset moves.  See
2120  * further kernel/cpuset.c update_nodemask().
2121  *
2122  * current's mempolicy may be rebinded by the other task(the task that changes
2123  * cpuset's mems), so we needn't do rebind work for current task.
2124  */
2125
2126 /* Slow path of a mempolicy duplicate */
2127 struct mempolicy *__mpol_dup(struct mempolicy *old)
2128 {
2129         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2130
2131         if (!new)
2132                 return ERR_PTR(-ENOMEM);
2133
2134         /* task's mempolicy is protected by alloc_lock */
2135         if (old == current->mempolicy) {
2136                 task_lock(current);
2137                 *new = *old;
2138                 task_unlock(current);
2139         } else
2140                 *new = *old;
2141
2142         if (current_cpuset_is_being_rebound()) {
2143                 nodemask_t mems = cpuset_mems_allowed(current);
2144                 if (new->flags & MPOL_F_REBINDING)
2145                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2146                 else
2147                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2148         }
2149         atomic_set(&new->refcnt, 1);
2150         return new;
2151 }
2152
2153 /* Slow path of a mempolicy comparison */
2154 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2155 {
2156         if (!a || !b)
2157                 return false;
2158         if (a->mode != b->mode)
2159                 return false;
2160         if (a->flags != b->flags)
2161                 return false;
2162         if (mpol_store_user_nodemask(a))
2163                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2164                         return false;
2165
2166         switch (a->mode) {
2167         case MPOL_BIND:
2168                 /* Fall through */
2169         case MPOL_INTERLEAVE:
2170                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2171         case MPOL_PREFERRED:
2172                 return a->v.preferred_node == b->v.preferred_node;
2173         default:
2174                 BUG();
2175                 return false;
2176         }
2177 }
2178
2179 /*
2180  * Shared memory backing store policy support.
2181  *
2182  * Remember policies even when nobody has shared memory mapped.
2183  * The policies are kept in a red-black tree linked from the inode.
2184  * They are protected by the sp->lock spinlock, which should be held
2185  * for any accesses to the tree.
2186  */
2187
2188 /* lookup first element intersecting start-end */
2189 /* Caller holds sp->lock */
2190 static struct sp_node *
2191 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2192 {
2193         struct rb_node *n = sp->root.rb_node;
2194
2195         while (n) {
2196                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2197
2198                 if (start >= p->end)
2199                         n = n->rb_right;
2200                 else if (end <= p->start)
2201                         n = n->rb_left;
2202                 else
2203                         break;
2204         }
2205         if (!n)
2206                 return NULL;
2207         for (;;) {
2208                 struct sp_node *w = NULL;
2209                 struct rb_node *prev = rb_prev(n);
2210                 if (!prev)
2211                         break;
2212                 w = rb_entry(prev, struct sp_node, nd);
2213                 if (w->end <= start)
2214                         break;
2215                 n = prev;
2216         }
2217         return rb_entry(n, struct sp_node, nd);
2218 }
2219
2220 /* Insert a new shared policy into the list. */
2221 /* Caller holds sp->lock */
2222 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2223 {
2224         struct rb_node **p = &sp->root.rb_node;
2225         struct rb_node *parent = NULL;
2226         struct sp_node *nd;
2227
2228         while (*p) {
2229                 parent = *p;
2230                 nd = rb_entry(parent, struct sp_node, nd);
2231                 if (new->start < nd->start)
2232                         p = &(*p)->rb_left;
2233                 else if (new->end > nd->end)
2234                         p = &(*p)->rb_right;
2235                 else
2236                         BUG();
2237         }
2238         rb_link_node(&new->nd, parent, p);
2239         rb_insert_color(&new->nd, &sp->root);
2240         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2241                  new->policy ? new->policy->mode : 0);
2242 }
2243
2244 /* Find shared policy intersecting idx */
2245 struct mempolicy *
2246 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2247 {
2248         struct mempolicy *pol = NULL;
2249         struct sp_node *sn;
2250
2251         if (!sp->root.rb_node)
2252                 return NULL;
2253         spin_lock(&sp->lock);
2254         sn = sp_lookup(sp, idx, idx+1);
2255         if (sn) {
2256                 mpol_get(sn->policy);
2257                 pol = sn->policy;
2258         }
2259         spin_unlock(&sp->lock);
2260         return pol;
2261 }
2262
2263 static void sp_free(struct sp_node *n)
2264 {
2265         mpol_put(n->policy);
2266         kmem_cache_free(sn_cache, n);
2267 }
2268
2269 /**
2270  * mpol_misplaced - check whether current page node is valid in policy
2271  *
2272  * @page: page to be checked
2273  * @vma: vm area where page mapped
2274  * @addr: virtual address where page mapped
2275  *
2276  * Lookup current policy node id for vma,addr and "compare to" page's
2277  * node id.
2278  *
2279  * Returns:
2280  *      -1      - not misplaced, page is in the right node
2281  *      node    - node id where the page should be
2282  *
2283  * Policy determination "mimics" alloc_page_vma().
2284  * Called from fault path where we know the vma and faulting address.
2285  */
2286 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2287 {
2288         struct mempolicy *pol;
2289         struct zone *zone;
2290         int curnid = page_to_nid(page);
2291         unsigned long pgoff;
2292         int thiscpu = raw_smp_processor_id();
2293         int thisnid = cpu_to_node(thiscpu);
2294         int polnid = -1;
2295         int ret = -1;
2296
2297         BUG_ON(!vma);
2298
2299         pol = get_vma_policy(current, vma, addr);
2300         if (!(pol->flags & MPOL_F_MOF))
2301                 goto out;
2302
2303         switch (pol->mode) {
2304         case MPOL_INTERLEAVE:
2305                 BUG_ON(addr >= vma->vm_end);
2306                 BUG_ON(addr < vma->vm_start);
2307
2308                 pgoff = vma->vm_pgoff;
2309                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2310                 polnid = offset_il_node(pol, vma, pgoff);
2311                 break;
2312
2313         case MPOL_PREFERRED:
2314                 if (pol->flags & MPOL_F_LOCAL)
2315                         polnid = numa_node_id();
2316                 else
2317                         polnid = pol->v.preferred_node;
2318                 break;
2319
2320         case MPOL_BIND:
2321                 /*
2322                  * allows binding to multiple nodes.
2323                  * use current page if in policy nodemask,
2324                  * else select nearest allowed node, if any.
2325                  * If no allowed nodes, use current [!misplaced].
2326                  */
2327                 if (node_isset(curnid, pol->v.nodes))
2328                         goto out;
2329                 (void)first_zones_zonelist(
2330                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2331                                 gfp_zone(GFP_HIGHUSER),
2332                                 &pol->v.nodes, &zone);
2333                 polnid = zone->node;
2334                 break;
2335
2336         default:
2337                 BUG();
2338         }
2339
2340         /* Migrate the page towards the node whose CPU is referencing it */
2341         if (pol->flags & MPOL_F_MORON) {
2342                 polnid = thisnid;
2343
2344                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2345                         goto out;
2346         }
2347
2348         if (curnid != polnid)
2349                 ret = polnid;
2350 out:
2351         mpol_cond_put(pol);
2352
2353         return ret;
2354 }
2355
2356 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2357 {
2358         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2359         rb_erase(&n->nd, &sp->root);
2360         sp_free(n);
2361 }
2362
2363 static void sp_node_init(struct sp_node *node, unsigned long start,
2364                         unsigned long end, struct mempolicy *pol)
2365 {
2366         node->start = start;
2367         node->end = end;
2368         node->policy = pol;
2369 }
2370
2371 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2372                                 struct mempolicy *pol)
2373 {
2374         struct sp_node *n;
2375         struct mempolicy *newpol;
2376
2377         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2378         if (!n)
2379                 return NULL;
2380
2381         newpol = mpol_dup(pol);
2382         if (IS_ERR(newpol)) {
2383                 kmem_cache_free(sn_cache, n);
2384                 return NULL;
2385         }
2386         newpol->flags |= MPOL_F_SHARED;
2387         sp_node_init(n, start, end, newpol);
2388
2389         return n;
2390 }
2391
2392 /* Replace a policy range. */
2393 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2394                                  unsigned long end, struct sp_node *new)
2395 {
2396         struct sp_node *n;
2397         struct sp_node *n_new = NULL;
2398         struct mempolicy *mpol_new = NULL;
2399         int ret = 0;
2400
2401 restart:
2402         spin_lock(&sp->lock);
2403         n = sp_lookup(sp, start, end);
2404         /* Take care of old policies in the same range. */
2405         while (n && n->start < end) {
2406                 struct rb_node *next = rb_next(&n->nd);
2407                 if (n->start >= start) {
2408                         if (n->end <= end)
2409                                 sp_delete(sp, n);
2410                         else
2411                                 n->start = end;
2412                 } else {
2413                         /* Old policy spanning whole new range. */
2414                         if (n->end > end) {
2415                                 if (!n_new)
2416                                         goto alloc_new;
2417
2418                                 *mpol_new = *n->policy;
2419                                 atomic_set(&mpol_new->refcnt, 1);
2420                                 sp_node_init(n_new, end, n->end, mpol_new);
2421                                 n->end = start;
2422                                 sp_insert(sp, n_new);
2423                                 n_new = NULL;
2424                                 mpol_new = NULL;
2425                                 break;
2426                         } else
2427                                 n->end = start;
2428                 }
2429                 if (!next)
2430                         break;
2431                 n = rb_entry(next, struct sp_node, nd);
2432         }
2433         if (new)
2434                 sp_insert(sp, new);
2435         spin_unlock(&sp->lock);
2436         ret = 0;
2437
2438 err_out:
2439         if (mpol_new)
2440                 mpol_put(mpol_new);
2441         if (n_new)
2442                 kmem_cache_free(sn_cache, n_new);
2443
2444         return ret;
2445
2446 alloc_new:
2447         spin_unlock(&sp->lock);
2448         ret = -ENOMEM;
2449         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2450         if (!n_new)
2451                 goto err_out;
2452         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2453         if (!mpol_new)
2454                 goto err_out;
2455         goto restart;
2456 }
2457
2458 /**
2459  * mpol_shared_policy_init - initialize shared policy for inode
2460  * @sp: pointer to inode shared policy
2461  * @mpol:  struct mempolicy to install
2462  *
2463  * Install non-NULL @mpol in inode's shared policy rb-tree.
2464  * On entry, the current task has a reference on a non-NULL @mpol.
2465  * This must be released on exit.
2466  * This is called at get_inode() calls and we can use GFP_KERNEL.
2467  */
2468 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2469 {
2470         int ret;
2471
2472         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2473         spin_lock_init(&sp->lock);
2474
2475         if (mpol) {
2476                 struct vm_area_struct pvma;
2477                 struct mempolicy *new;
2478                 NODEMASK_SCRATCH(scratch);
2479
2480                 if (!scratch)
2481                         goto put_mpol;
2482                 /* contextualize the tmpfs mount point mempolicy */
2483                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2484                 if (IS_ERR(new))
2485                         goto free_scratch; /* no valid nodemask intersection */
2486
2487                 task_lock(current);
2488                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2489                 task_unlock(current);
2490                 if (ret)
2491                         goto put_new;
2492
2493                 /* Create pseudo-vma that contains just the policy */
2494                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2495                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2496                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2497
2498 put_new:
2499                 mpol_put(new);                  /* drop initial ref */
2500 free_scratch:
2501                 NODEMASK_SCRATCH_FREE(scratch);
2502 put_mpol:
2503                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2504         }
2505 }
2506
2507 int mpol_set_shared_policy(struct shared_policy *info,
2508                         struct vm_area_struct *vma, struct mempolicy *npol)
2509 {
2510         int err;
2511         struct sp_node *new = NULL;
2512         unsigned long sz = vma_pages(vma);
2513
2514         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2515                  vma->vm_pgoff,
2516                  sz, npol ? npol->mode : -1,
2517                  npol ? npol->flags : -1,
2518                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2519
2520         if (npol) {
2521                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2522                 if (!new)
2523                         return -ENOMEM;
2524         }
2525         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2526         if (err && new)
2527                 sp_free(new);
2528         return err;
2529 }
2530
2531 /* Free a backing policy store on inode delete. */
2532 void mpol_free_shared_policy(struct shared_policy *p)
2533 {
2534         struct sp_node *n;
2535         struct rb_node *next;
2536
2537         if (!p->root.rb_node)
2538                 return;
2539         spin_lock(&p->lock);
2540         next = rb_first(&p->root);
2541         while (next) {
2542                 n = rb_entry(next, struct sp_node, nd);
2543                 next = rb_next(&n->nd);
2544                 sp_delete(p, n);
2545         }
2546         spin_unlock(&p->lock);
2547 }
2548
2549 #ifdef CONFIG_NUMA_BALANCING
2550 static int __initdata numabalancing_override;
2551
2552 static void __init check_numabalancing_enable(void)
2553 {
2554         bool numabalancing_default = false;
2555
2556         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2557                 numabalancing_default = true;
2558
2559         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2560         if (numabalancing_override)
2561                 set_numabalancing_state(numabalancing_override == 1);
2562
2563         if (nr_node_ids > 1 && !numabalancing_override) {
2564                 pr_info("%s automatic NUMA balancing. "
2565                         "Configure with numa_balancing= or the "
2566                         "kernel.numa_balancing sysctl",
2567                         numabalancing_default ? "Enabling" : "Disabling");
2568                 set_numabalancing_state(numabalancing_default);
2569         }
2570 }
2571
2572 static int __init setup_numabalancing(char *str)
2573 {
2574         int ret = 0;
2575         if (!str)
2576                 goto out;
2577
2578         if (!strcmp(str, "enable")) {
2579                 numabalancing_override = 1;
2580                 ret = 1;
2581         } else if (!strcmp(str, "disable")) {
2582                 numabalancing_override = -1;
2583                 ret = 1;
2584         }
2585 out:
2586         if (!ret)
2587                 pr_warn("Unable to parse numa_balancing=\n");
2588
2589         return ret;
2590 }
2591 __setup("numa_balancing=", setup_numabalancing);
2592 #else
2593 static inline void __init check_numabalancing_enable(void)
2594 {
2595 }
2596 #endif /* CONFIG_NUMA_BALANCING */
2597
2598 /* assumes fs == KERNEL_DS */
2599 void __init numa_policy_init(void)
2600 {
2601         nodemask_t interleave_nodes;
2602         unsigned long largest = 0;
2603         int nid, prefer = 0;
2604
2605         policy_cache = kmem_cache_create("numa_policy",
2606                                          sizeof(struct mempolicy),
2607                                          0, SLAB_PANIC, NULL);
2608
2609         sn_cache = kmem_cache_create("shared_policy_node",
2610                                      sizeof(struct sp_node),
2611                                      0, SLAB_PANIC, NULL);
2612
2613         for_each_node(nid) {
2614                 preferred_node_policy[nid] = (struct mempolicy) {
2615                         .refcnt = ATOMIC_INIT(1),
2616                         .mode = MPOL_PREFERRED,
2617                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2618                         .v = { .preferred_node = nid, },
2619                 };
2620         }
2621
2622         /*
2623          * Set interleaving policy for system init. Interleaving is only
2624          * enabled across suitably sized nodes (default is >= 16MB), or
2625          * fall back to the largest node if they're all smaller.
2626          */
2627         nodes_clear(interleave_nodes);
2628         for_each_node_state(nid, N_MEMORY) {
2629                 unsigned long total_pages = node_present_pages(nid);
2630
2631                 /* Preserve the largest node */
2632                 if (largest < total_pages) {
2633                         largest = total_pages;
2634                         prefer = nid;
2635                 }
2636
2637                 /* Interleave this node? */
2638                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2639                         node_set(nid, interleave_nodes);
2640         }
2641
2642         /* All too small, use the largest */
2643         if (unlikely(nodes_empty(interleave_nodes)))
2644                 node_set(prefer, interleave_nodes);
2645
2646         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2647                 pr_err("%s: interleaving failed\n", __func__);
2648
2649         check_numabalancing_enable();
2650 }
2651
2652 /* Reset policy of current process to default */
2653 void numa_default_policy(void)
2654 {
2655         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2656 }
2657
2658 /*
2659  * Parse and format mempolicy from/to strings
2660  */
2661
2662 /*
2663  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2664  */
2665 static const char * const policy_modes[] =
2666 {
2667         [MPOL_DEFAULT]    = "default",
2668         [MPOL_PREFERRED]  = "prefer",
2669         [MPOL_BIND]       = "bind",
2670         [MPOL_INTERLEAVE] = "interleave",
2671         [MPOL_LOCAL]      = "local",
2672 };
2673
2674
2675 #ifdef CONFIG_TMPFS
2676 /**
2677  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2678  * @str:  string containing mempolicy to parse
2679  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2680  *
2681  * Format of input:
2682  *      <mode>[=<flags>][:<nodelist>]
2683  *
2684  * On success, returns 0, else 1
2685  */
2686 int mpol_parse_str(char *str, struct mempolicy **mpol)
2687 {
2688         struct mempolicy *new = NULL;
2689         unsigned short mode;
2690         unsigned short mode_flags;
2691         nodemask_t nodes;
2692         char *nodelist = strchr(str, ':');
2693         char *flags = strchr(str, '=');
2694         int err = 1;
2695
2696         if (nodelist) {
2697                 /* NUL-terminate mode or flags string */
2698                 *nodelist++ = '\0';
2699                 if (nodelist_parse(nodelist, nodes))
2700                         goto out;
2701                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2702                         goto out;
2703         } else
2704                 nodes_clear(nodes);
2705
2706         if (flags)
2707                 *flags++ = '\0';        /* terminate mode string */
2708
2709         for (mode = 0; mode < MPOL_MAX; mode++) {
2710                 if (!strcmp(str, policy_modes[mode])) {
2711                         break;
2712                 }
2713         }
2714         if (mode >= MPOL_MAX)
2715                 goto out;
2716
2717         switch (mode) {
2718         case MPOL_PREFERRED:
2719                 /*
2720                  * Insist on a nodelist of one node only
2721                  */
2722                 if (nodelist) {
2723                         char *rest = nodelist;
2724                         while (isdigit(*rest))
2725                                 rest++;
2726                         if (*rest)
2727                                 goto out;
2728                 }
2729                 break;
2730         case MPOL_INTERLEAVE:
2731                 /*
2732                  * Default to online nodes with memory if no nodelist
2733                  */
2734                 if (!nodelist)
2735                         nodes = node_states[N_MEMORY];
2736                 break;
2737         case MPOL_LOCAL:
2738                 /*
2739                  * Don't allow a nodelist;  mpol_new() checks flags
2740                  */
2741                 if (nodelist)
2742                         goto out;
2743                 mode = MPOL_PREFERRED;
2744                 break;
2745         case MPOL_DEFAULT:
2746                 /*
2747                  * Insist on a empty nodelist
2748                  */
2749                 if (!nodelist)
2750                         err = 0;
2751                 goto out;
2752         case MPOL_BIND:
2753                 /*
2754                  * Insist on a nodelist
2755                  */
2756                 if (!nodelist)
2757                         goto out;
2758         }
2759
2760         mode_flags = 0;
2761         if (flags) {
2762                 /*
2763                  * Currently, we only support two mutually exclusive
2764                  * mode flags.
2765                  */
2766                 if (!strcmp(flags, "static"))
2767                         mode_flags |= MPOL_F_STATIC_NODES;
2768                 else if (!strcmp(flags, "relative"))
2769                         mode_flags |= MPOL_F_RELATIVE_NODES;
2770                 else
2771                         goto out;
2772         }
2773
2774         new = mpol_new(mode, mode_flags, &nodes);
2775         if (IS_ERR(new))
2776                 goto out;
2777
2778         /*
2779          * Save nodes for mpol_to_str() to show the tmpfs mount options
2780          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2781          */
2782         if (mode != MPOL_PREFERRED)
2783                 new->v.nodes = nodes;
2784         else if (nodelist)
2785                 new->v.preferred_node = first_node(nodes);
2786         else
2787                 new->flags |= MPOL_F_LOCAL;
2788
2789         /*
2790          * Save nodes for contextualization: this will be used to "clone"
2791          * the mempolicy in a specific context [cpuset] at a later time.
2792          */
2793         new->w.user_nodemask = nodes;
2794
2795         err = 0;
2796
2797 out:
2798         /* Restore string for error message */
2799         if (nodelist)
2800                 *--nodelist = ':';
2801         if (flags)
2802                 *--flags = '=';
2803         if (!err)
2804                 *mpol = new;
2805         return err;
2806 }
2807 #endif /* CONFIG_TMPFS */
2808
2809 /**
2810  * mpol_to_str - format a mempolicy structure for printing
2811  * @buffer:  to contain formatted mempolicy string
2812  * @maxlen:  length of @buffer
2813  * @pol:  pointer to mempolicy to be formatted
2814  *
2815  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2816  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2817  * longest flag, "relative", and to display at least a few node ids.
2818  */
2819 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2820 {
2821         char *p = buffer;
2822         nodemask_t nodes = NODE_MASK_NONE;
2823         unsigned short mode = MPOL_DEFAULT;
2824         unsigned short flags = 0;
2825
2826         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2827                 mode = pol->mode;
2828                 flags = pol->flags;
2829         }
2830
2831         switch (mode) {
2832         case MPOL_DEFAULT:
2833                 break;
2834         case MPOL_PREFERRED:
2835                 if (flags & MPOL_F_LOCAL)
2836                         mode = MPOL_LOCAL;
2837                 else
2838                         node_set(pol->v.preferred_node, nodes);
2839                 break;
2840         case MPOL_BIND:
2841         case MPOL_INTERLEAVE:
2842                 nodes = pol->v.nodes;
2843                 break;
2844         default:
2845                 WARN_ON_ONCE(1);
2846                 snprintf(p, maxlen, "unknown");
2847                 return;
2848         }
2849
2850         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2851
2852         if (flags & MPOL_MODE_FLAGS) {
2853                 p += snprintf(p, buffer + maxlen - p, "=");
2854
2855                 /*
2856                  * Currently, the only defined flags are mutually exclusive
2857                  */
2858                 if (flags & MPOL_F_STATIC_NODES)
2859                         p += snprintf(p, buffer + maxlen - p, "static");
2860                 else if (flags & MPOL_F_RELATIVE_NODES)
2861                         p += snprintf(p, buffer + maxlen - p, "relative");
2862         }
2863
2864         if (!nodes_empty(nodes)) {
2865                 p += snprintf(p, buffer + maxlen - p, ":");
2866                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2867         }
2868 }